Analyzing and Predicting Airline Ticket Prices Using Python
Flight ticket prices are influenced by multiple factors, including airline, route, number of stops, departure and arrival times, flight duration, and booking time. By analyzing these elements, airlines can optimize pricing strategies to enhance competitiveness, while passengers can benefit from price prediction services to purchase tickets at optimal times.
This analysis employs Python for exploratory data analysis (EDA) and predictive modeling on flight price data. The goal is to identify key price determinants and build accurate forecasting models.
Data Overview The dataset includes flight details such as airline, source and destination cities, total stops, price, date, month, year, departure and arrival times, and duration. Initial steps involve loading and inspecting the data.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
# Load the raw flight records into a DataFrame and preview the first five rows.
flight_data = pd.read_csv(filepath_or_buffer='flight_dataset.csv')
print(flight_data.head(n=5))
Data Cleaning and Preparation Perform descriptive statistics and check data types.
# Summary statistics for the numeric columns.
print(flight_data.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
flight_data.info()
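Identify missing values by counting the nulls in each column; any column with a non-zero count should be handled (imputed or dropped) before modeling.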
# Count the null entries in every column and print the per-column totals.
null_mask = flight_data.isnull()
missing_per_column = null_mask.sum()
print(missing_per_column)
Remove duplicate entries.
# Keep only the first occurrence of each fully identical row.
flight_data = flight_data.drop_duplicates()
Check data shape.
# Report the current (rows, columns) dimensions of the dataset.
row_count, column_count = flight_data.shape
print((row_count, column_count))
Eliminate outliers from numerical columns using the interquartile range (IQR) method.
def remove_outliers(dataframe, column, factor=1.5):
    """Return the rows of ``dataframe`` whose ``column`` lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``column``. Both fences are
    inclusive. Rows whose value is NaN fail both comparisons and are therefore
    excluded from the result.

    Args:
        dataframe: The pandas DataFrame to filter. It is not modified.
        column: Name of the numeric column used to detect outliers.
        factor: Multiplier applied to the IQR when building the fences.
            Defaults to 1.5, the conventional box-plot whisker length.

    Returns:
        A DataFrame containing only the rows within the fences, with the
        original index labels preserved.
    """
    values = dataframe[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = factor * (third_quartile - first_quartile)
    lower_bound = first_quartile - spread
    upper_bound = third_quartile + spread
    within_fences = (values >= lower_bound) & (values <= upper_bound)
    return dataframe[within_fences]
# Filter sequentially: each pass computes its fences on the rows that
# survived the previous pass, so the column order matters.
for outlier_column in ('Price', 'Duration_hours', 'Total_Stops'):
    flight_data = remove_outliers(flight_data, outlier_column)
Exploratory Data Analysis Visualize distributions and relationships.
Box plots for numerical features.
# One box plot per int64/float64 column, each drawn in its own figure.
numeric_features = flight_data.select_dtypes(include=['int64', 'float64'])
for feature in numeric_features:  # iterating a DataFrame yields its column labels
    plt.figure(figsize=(10, 6))
    sns.boxplot(y=feature, data=flight_data)
    plt.title(f'Box Plot of {feature}')
    plt.show()
Histogram of ticket prices.
# Histogram of ticket prices with a kernel-density estimate overlaid.
plt.figure(figsize=(12, 8))
price_values = flight_data['Price']
sns.histplot(price_values, color='blue', kde=True)
plt.title('Distribution of Flight Prices')
plt.show()
Bar chart of airline frequencies.
# Number of flights per airline, most frequent first (value_counts order).
airline_counts = flight_data['Airline'].value_counts()
airline_counts.plot(kind='bar', figsize=(10, 6))
plt.title('Flight Counts by Airline')
plt.show()
Analyze popular routes using a network graph.
# Count the flights on each directed Source -> Destination pair.
route_counts = flight_data.groupby(['Source', 'Destination']).size().reset_index(name='Frequency')
route_counts = route_counts.sort_values(by='Frequency', ascending=False)

# Directed graph: one edge per route, carrying its flight count as 'Frequency'.
G = nx.from_pandas_edgelist(route_counts, 'Source', 'Destination', ['Frequency'], create_using=nx.DiGraph())

# Shared by the node and edge drawing calls. draw_networkx_edges uses
# node_size to decide where arrowheads stop; if it is left at its default
# (300) while nodes are drawn at 3000, the arrowheads are hidden under the
# node markers.
node_size = 3000

plt.figure(figsize=(15, 10))
# A fixed seed makes the force-directed layout reproducible between runs.
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color='lightgreen', edgecolors='black')
nx.draw_networkx_edges(
    G,
    pos,
    node_size=node_size,
    arrowstyle='-|>',
    arrowsize=20,
    edge_color='gray',
    width=2,
)
nx.draw_networkx_labels(G, pos, font_size=12, font_weight='bold')

# Annotate every edge with its flight count.
edge_labels = nx.get_edge_attributes(G, 'Frequency')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=10, font_color='red')
plt.title('Airline Route Network')
plt.axis('off')
plt.show()
Examine relationships between price and other factors.
Average price by airline.
# Mean ticket price per airline, most expensive first.
price_by_airline = flight_data.groupby('Airline')['Price']
airline_avg_price = price_by_airline.mean().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=airline_avg_price.index,
    y=airline_avg_price.values,
    palette='viridis',
)
plt.xticks(rotation=90)  # airline names are long; rotate so they do not overlap
plt.title('Average Price by Airline')
plt.show()
Average price by route.
# Mean ticket price for each Source -> Destination pair, most expensive first.
route_avg_price = (
    flight_data.groupby(['Source', 'Destination'])['Price']
    .mean()
    .reset_index(name='Average_Price')
    .sort_values(by='Average_Price', ascending=False)
)

# Human-readable tick labels of the form "<Source> to <Destination>".
route_labels = route_avg_price['Source'] + ' to ' + route_avg_price['Destination']

plt.figure(figsize=(12, 8))
sns.barplot(x=route_labels, y=route_avg_price['Average_Price'], palette='Set2')
plt.xticks(rotation=45)
plt.title('Average Price by Route')
plt.show()
Scatter plot of flight duration versus price.
# Scatter of ticket price against flight duration (rows ordered by duration).
flight_data_sorted = flight_data.sort_values(by='Duration_hours')
duration_hours = flight_data_sorted['Duration_hours']
ticket_price = flight_data_sorted['Price']

plt.figure(figsize=(12, 8))
plt.scatter(duration_hours, ticket_price, color='purple', alpha=0.6)
plt.title('Flight Duration vs Price')
plt.xlabel('Flight Duration (hours)')
plt.ylabel('Price')
plt.grid(True)
plt.show()
Correlation heatmap.
# Pairwise correlations between the int64/float64 columns, drawn as an
# annotated heatmap.
numeric_data = flight_data.select_dtypes(include=['int64', 'float64'])
correlation_matrix = numeric_data.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()
Predictive Modeling Prepare data for machine learning by normalizing numerical features and encoding categorical variables.
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Separate the target from the predictors first, so the feature lists below
# are derived from X and can never contain the 'Price' label.
X = flight_data.drop(columns=['Price'])
y = flight_data['Price']

# Numeric predictors that get min-max scaled.
numerical_features = ['Total_Stops', 'Date', 'Month', 'Dep_hours', 'Dep_min', 'Arrival_hours', 'Arrival_min', 'Duration_hours']
# Every object-dtype predictor is treated as categorical and one-hot encoded.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# NOTE(review): no `remainder` is passed, so any column of X that appears in
# neither list is presumably dropped by the transformer -- confirm that every
# intended predictor is listed above.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        # handle_unknown='ignore' keeps prediction from failing on categories
        # that were absent from the training split.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)
Split data and train multiple regression models.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, keyed by the display name used in the report line.
regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

for name, model in regression_models.items():
    # Preprocessing is part of the pipeline, so it is fitted on the training
    # split only and then applied unchanged to the test split.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])
    predictions = pipeline.fit(X_train, y_train).predict(X_test)

    # Score the held-out predictions.
    mse, mae, r2 = (
        metric(y_test, predictions)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    print(f'{name}: MSE={mse:.2f}, MAE={mae:.2f}, R2={r2:.2f}')
Results indicate that Random Forest and Gradient Boosting models generally offer better predictive accuracy, followed by Decision Tree and linear models.
Key Findings
- Primary Price Influencers: Flight prices are significantly affected by airline, route, number of stops, and flight duration. Variations exist across different airlines and routes, with stops and duration showing strong correlations with price.
- Model Effectiveness: Machine learning models, particularly ensemble methods like Random Forest, provide reliable price predictions. Model tuning and validation can further enhance accuracy.
- Data-Driven Pricing: Airlines can leverage these insights to develop dynamic pricing strategies, adjusting fares based on market demand and competitive conditions to improve profitability.
- Passenger Recommendations: Price prediction models assist passengers in identifying optimal booking times to avoid peak pricing and reduce travel costs.