Data Processing and Model Training Pipeline for Deep Learning Applications
Data Preparation
Begin by creating a duplicate of the original dataset to prevent contamination. Identify missing values using visualizations like heatmaps, and remove redundant fields.
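A minimal sketch of these first steps (the file name and the raw_dataset variable are illustrative, not from the original pipeline):
import pandas as pd
import seaborn as sns
raw_dataset = pd.read_csv('insurance_data.csv')  # hypothetical file name
dataset = raw_dataset.copy()  # work on a copy so the original stays untouched
# Light cells in the heatmap mark missing entries per column
sns.heatmap(dataset.isnull(), cbar=False)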
import numpy as np
import pandas as pd
# Find symmetric difference between two lists
list_a = ["tom","name",3]
list_b = [7,"name2",3]
print(np.setxor1d(list_a,list_b)) # ['7' 'name' 'name2' 'tom']
# Transform column names using mapping dictionary
col_mapping = {'NY8Y9': 'N1819', 'N2N29': 'N2029', 'N3N39': 'N3039',
'N4N49': 'N4049', 'N5N59': 'N5059', 'N6N64': 'N6064'}
def transform_columns(col_name):
    return col_mapping.get(col_name, col_name)
dataset.columns = dataset.columns.map(transform_columns)
Translate English column headers to Chinese equivalents:
# Create translation dictionary from feature metadata
translation_map = dict(feature_metadata[['english_name','chinese_name']].values)
def translate_headers(df):
    df_copy = df.copy()
    df_copy.columns = pd.Series(df_copy.columns).map(translation_map)
    return df_copy
# Apply translation to first 5 columns
translated_data = translate_headers(dataset[feature_metadata.english_name[:5].tolist()])
Examine field characteristics including data types, value distributions, and missing value counts:
def analyze_fields(dataframe):
    for column in dataframe.columns:
        print("Field name:", column)
        print("Data type:", dataframe[column].dtype)
        print("Value distribution:")
        print(dataframe[column].value_counts())
        print("Missing values:", dataframe[column].isnull().sum())
        print("---")
Visualize categorical distributions and continuous variable density plots:
import seaborn as sns
import matplotlib.pyplot as plt
# Configure Chinese font support
sns.set_style("darkgrid", {"font.sans-serif": ['simhei','Droid Sans Fallback']})
plt.figure(figsize=(6,2))
sns.countplot(y='purchase_indicator', data=translated_data)
plt.show()
# Density plot by age groups
sns.kdeplot(translated_data.age[translated_data.purchase_indicator==1], label='Purchased')
sns.kdeplot(translated_data.age[translated_data.purchase_indicator==0], label='Not Purchased')
sns.kdeplot(translated_data.age.dropna(), label='All Records')
plt.xlim([60,90])
plt.xlabel('Age')
plt.ylabel('Density')
plt.legend()
plt.show()
Data Cleaning Operations
Convert categorical variables to numerical representations:
def encode_categorical_features(df):
    df_encoded = df.copy()
    for column in df_encoded.columns:
        if df_encoded[column].dtype == 'object':
            unique_values = list(df_encoded[column].value_counts().index)
            encoding_dict = dict(zip(unique_values, range(len(unique_values))))
            df_encoded[column] = df_encoded[column].map(encoding_dict)
    return df_encoded
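Applied to the translated frame (an assumed input; any cleaned frame works), this yields the encoded_data frame used for the correlation heatmap below:
encoded_data = encode_categorical_features(translated_data)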
# Alternative method using scikit-learn (categorical_features stands in for
# any frame of object-dtype columns)
from sklearn.preprocessing import OrdinalEncoder
encoded_array = OrdinalEncoder().fit_transform(categorical_features)
Identify highly correlated features for potential removal:
def find_highly_correlated_features(dataframe, threshold=0.65):
    correlation_matrix = dataframe.corr().abs()
    high_corr_columns = []
    for column in correlation_matrix.columns:
        # A column always correlates perfectly with itself, so a count of 2 or
        # more means at least one *other* column exceeds the threshold
        if (correlation_matrix[column] > threshold).sum() >= 2:
            high_corr_columns.append(column)
    return high_corr_columns
# Visualize correlations
sns.heatmap(encoded_data.corr(), cmap='Blues')
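For example, candidates above the default threshold can be listed (the variable name is illustrative):
redundant_candidates = find_highly_correlated_features(encoded_data)
print(redundant_candidates)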
Remove specified columns and duplicate records:
columns_to_remove = ["KBM_INDV_ID","U18","POEP","AART","AHCH","AASN","COLLEGE",
                     "INVE","c210cip","c210hmi","c210hva","c210kses","c210blu",
                     "c210bpvt","c210poo","meda"]
cleaned_dataset = dataset.drop(columns=columns_to_remove)
cleaned_dataset = cleaned_dataset.drop_duplicates()
Dataset Partitioning
Split the data into training and testing subsets before performing imputation or encoding, so those steps learn their statistics from the training data only:
from sklearn.model_selection import train_test_split
target_variable = cleaned_dataset.pop('response_flag')
feature_variables = cleaned_dataset
X_train, X_test, y_train, y_test = train_test_split(
    feature_variables, target_variable, test_size=0.3, random_state=100
)
# Create working copies
train_features = X_train.copy()
test_features = X_test.copy()
train_labels = y_train.copy()
test_labels = y_test.copy()
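Because purchase responses are typically imbalanced, it is worth checking the class ratio in each split; passing stratify= preserves it exactly (an optional variant, not part of the original split):
# Inspect the class balance in each split
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))
# Optional: stratified variant of the same split
X_train, X_test, y_train, y_test = train_test_split(
    feature_variables, target_variable, test_size=0.3,
    random_state=100, stratify=target_variable
)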
Handle missing values by filling with median and mode statistics computed on the training set:
# Fill numeric columns with median values
numeric_columns = ["age","c210mah","c210b200","c210psu","c210wht","ilor"]
median_values = train_features[numeric_columns].median()
median_dict = dict(zip(median_values.index, median_values))
train_features = train_features.fillna(median_dict)
# Fill categorical columns with mode values
categorical_columns = ["N1819","ASKN","MOBPLUS","N2NCY","LIVEWELL","HOMSTAT","HINSUB"]
mode_values = train_features[categorical_columns].mode().iloc[0,:]
mode_dict = dict(zip(mode_values.index, mode_values))
train_features = train_features.fillna(mode_dict)
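To avoid leaking test-set information, the same training-set statistics can be reused when filling the test split:
# Reuse the medians and modes computed on the training split
test_features = test_features.fillna(median_dict)
test_features = test_features.fillna(mode_dict)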
Feature Encoding Implementation
Apply binary and one-hot encoding transformations:
# Binary encoding for selected features
encoding_reference = pd.read_excel('insurance_data_dictionary_cleaned.xlsx', sheet_name=2)
binary_encoding_columns = encoding_reference[encoding_reference['transformation']=='binary'].variable_name
binary_features = test_features[binary_encoding_columns]
encoded_binary = OrdinalEncoder().fit_transform(binary_features)
binary_dataframe = pd.DataFrame(
    data=encoded_binary,
    columns=binary_features.columns,
    index=binary_features.index
)
test_features[binary_encoding_columns] = binary_dataframe
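Note that the snippet above fits the encoder on the test split alone; to keep the two splits consistent, the encoder is typically fit on the training split and applied to both. A sketch replacing the step above:
# Fit on the training split; unknown categories in the test split map to -1
binary_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
binary_encoder.fit(train_features[binary_encoding_columns])
train_features[binary_encoding_columns] = binary_encoder.transform(
    train_features[binary_encoding_columns])
test_features[binary_encoding_columns] = binary_encoder.transform(
    test_features[binary_encoding_columns])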
Generate dummy variables for categorical features:
# One-hot encoding implementation
onehot_columns = encoding_reference[encoding_reference['transformation']=='dummy'].variable_name
# Partition the one-hot columns by dtype (an assumed split; the original left
# string_columns and nonstring_columns undefined)
string_columns = [c for c in onehot_columns if test_features[c].dtype == 'object']
nonstring_columns = [c for c in onehot_columns if c not in string_columns]
# Process string-type categorical variables
string_dummies = pd.get_dummies(translate_headers(test_features[string_columns]))
# Process non-string categorical variables by casting to str first
nonstring_features = test_features[nonstring_columns].astype(str)
nonstring_dummies = pd.get_dummies(translate_headers(nonstring_features))
# Combine encoded features
remaining_features = translate_headers(test_features.drop(columns=string_columns+nonstring_columns))
final_test_features = pd.concat([remaining_features, string_dummies, nonstring_dummies], axis=1)
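One-hot encoding the two splits independently can produce mismatched column sets; aligning them guards against that. A sketch, assuming final_train_features was built from train_features the same way:
final_train_features, final_test_features = final_train_features.align(
    final_test_features, join='left', axis=1, fill_value=0)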
Model Development and Evaluation
Train initial decision tree classifier with cross-validation:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
classifier = DecisionTreeClassifier(random_state=420, class_weight='balanced')
cross_validation_scores = cross_val_score(classifier, train_features, train_labels)
average_score = cross_validation_scores.mean() # Approximately 0.599
Optimize hyperparameters using grid search:
from sklearn.model_selection import GridSearchCV
parameter_grid = {
    'splitter': ('best','random'),
    'criterion': ('gini','entropy'),
    'max_depth': range(3,15)
}
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameter_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=2
)
grid_search.fit(train_features, train_labels)
optimal_score = grid_search.best_score_ # Approximately 0.692
optimal_parameters = grid_search.best_params_
# {'criterion': 'entropy', 'max_depth': 6, 'splitter': 'best'}
Evaluate model performance using multiple metrics:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve
predictions = grid_search.predict(test_features)
accuracy = accuracy_score(predictions, test_labels) # ~0.609
precision = precision_score(predictions, test_labels) # ~0.748
recall = recall_score(predictions, test_labels) # ~0.510
false_positive_rate, true_positive_rate, thresholds = roc_curve(predictions, test_labels)
plt.plot(false_positive_rate, true_positive_rate, c='b', label='ROC Curve')
plt.plot(false_positive_rate, false_positive_rate, c='r', ls='--')
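The area under this curve summarizes ranking quality in a single number:
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probabilities)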
Export decision tree visualization:
from sklearn import tree
import graphviz
optimized_classifier = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    splitter='best',
    random_state=420,
    class_weight='balanced'
)
fitted_model = optimized_classifier.fit(train_features, train_labels)
feature_names = train_features.columns
tree_visualization = tree.export_graphviz(
    fitted_model,
    feature_names=feature_names,
    class_names=['Not Purchase','Purchase'],
    filled=True,
    rounded=True,
    leaves_parallel=False
)
graph = graphviz.Source(tree_visualization)
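To write the rendering to disk (the output name is illustrative; the Graphviz binaries must be installed on the system):
graph.render('insurance_tree', format='png', cleanup=True)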