Data Processing and Model Training Pipeline for Deep Learning Applications

Data Preparation

Begin by creating a duplicate of the original dataset so that cleaning steps never modify the source data. Identify missing values with visualizations such as heatmaps, and remove redundant fields.
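
A minimal sketch of both steps (raw_dataset is an illustrative name for the source frame):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# work on a copy so the raw data stays untouched
dataset = raw_dataset.copy()

# dark cells mark missing entries, one plot column per field
sns.heatmap(dataset.isnull(), cbar=False)
plt.show()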

import numpy as np
import pandas as pd

# Find the symmetric difference between two lists
# (NumPy casts the mixed-type inputs to strings, hence the quoted output)
list_a = ["tom","name",3]
list_b = [7,"name2",3]
print(np.setxor1d(list_a,list_b)) # ['7' 'name' 'name2' 'tom']

# Transform column names using mapping dictionary
col_mapping = {'NY8Y9': 'N1819', 'N2N29': 'N2029', 'N3N39': 'N3039', 
               'N4N49': 'N4049', 'N5N59': 'N5059', 'N6N64': 'N6064'}

def transform_columns(col_name):
    return col_mapping.get(col_name, col_name)

dataset.columns = dataset.columns.map(transform_columns)

Translate English column headers to Chinese equivalents:

# Create translation dictionary from feature metadata
translation_map = dict(feature_metadata[['english_name','chinese_name']].values)

def translate_headers(df):
    df_copy = df.copy()
    df_copy.columns = pd.Series(df_copy.columns).map(translation_map)
    return df_copy

# Apply translation to first 5 columns
translated_data = translate_headers(dataset[feature_metadata.english_name[:5].tolist()])

Examine field characteristics including data types, value distributions, and missing value counts:

def analyze_fields(dataframe):
    for column in dataframe.columns:
        print("Field name:", column)
        print("Data type:", dataframe[column].dtype)
        print("Value distribution:")
        print(dataframe[column].value_counts())
        print("Missing values:", dataframe[column].isnull().sum())
        print("---")

Visualize categorical distributions and continuous variable density plots:

import seaborn as sns
import matplotlib.pyplot as plt

# Configure Chinese font support
sns.set_style("darkgrid", {"font.sans-serif": ['simhei','Droid Sans Fallback']})
plt.figure(figsize=(6,2))
sns.countplot(y='purchase_indicator', data=translated_data)
plt.show()

# Density plot by age groups
sns.kdeplot(translated_data.age[translated_data.purchase_indicator==1], label='Purchased')
sns.kdeplot(translated_data.age[translated_data.purchase_indicator==0], label='Not Purchased')
sns.kdeplot(translated_data.age.dropna(), label='All Records')
plt.xlim([60,90])
plt.xlabel('Age')
plt.ylabel('Density')
plt.legend()
plt.show()

Data Cleaning Operations

Convert categorical variables to numerical representations:

def encode_categorical_features(df):
    df_encoded = df.copy()
    for column in df_encoded.columns:
        if df_encoded[column].dtype == 'object':
            unique_values = list(df_encoded[column].value_counts().index)
            encoding_dict = dict(zip(unique_values, range(len(unique_values))))
            df_encoded[column] = df_encoded[column].map(encoding_dict)
    return df_encoded

# Alternative method using scikit-learn (categorical_features is any frame
# of object-dtype columns; the result is a NumPy array, not a DataFrame)
from sklearn.preprocessing import OrdinalEncoder
encoded_array = OrdinalEncoder().fit_transform(categorical_features)

Identify highly correlated features for potential removal:

def find_highly_correlated_features(dataframe, threshold=0.65):
    correlation_matrix = dataframe.corr().abs()
    high_corr_columns = []
    
    for column in correlation_matrix.columns:
        if (correlation_matrix[column] > threshold).sum() >= 2:
            high_corr_columns.append(column)
    
    return high_corr_columns
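
A quick usage sketch that produces the encoded_data frame referenced by the heatmap below (assuming dataset is the frame prepared earlier):

encoded_data = encode_categorical_features(dataset)
redundant_columns = find_highly_correlated_features(encoded_data, threshold=0.65)
print(redundant_columns)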

# Visualize correlations
sns.heatmap(encoded_data.corr(), cmap='Blues')

Remove specified columns and duplicate records:

columns_to_remove = ["KBM_INDV_ID","U18","POEP","AART","AHCH","AASN","COLLEGE",
                     "INVE","c210cip","c210hmi","c210hva","c210kses","c210blu",
                     "c210bpvt","c210poo","meda"]

cleaned_dataset = dataset.drop(columns=columns_to_remove)
cleaned_dataset = cleaned_dataset.drop_duplicates()

Dataset Partitioning

Split data into training and testing subsets before performing imputation or encoding:

from sklearn.model_selection import train_test_split

target_variable = cleaned_dataset.pop('response_flag')
feature_variables = cleaned_dataset

X_train, X_test, y_train, y_test = train_test_split(
    feature_variables, target_variable, test_size=0.3, random_state=100
)

# Create working copies
train_features = X_train.copy()
test_features = X_test.copy()
train_labels = y_train.copy()
test_labels = y_test.copy()

Handle missing values by filling with median and mode statistics:

# Fill numeric columns with median values
numeric_columns = ["age","c210mah","c210b200","c210psu","c210wht","ilor"]
median_values = train_features[numeric_columns].median()
median_dict = dict(zip(median_values.index, median_values))
train_features = train_features.fillna(median_dict)

# Fill categorical columns with mode values
categorical_columns = ["N1819","ASKN","MOBPLUS","N2NCY","LIVEWELL","HOMSTAT","HINSUB"]
mode_values = train_features[categorical_columns].mode().iloc[0,:]
mode_dict = dict(zip(mode_values.index, mode_values))
train_features = train_features.fillna(mode_dict)
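
To avoid leakage, reuse the training-split statistics on the test split instead of recomputing them (a sketch using the dictionaries built above):

# apply training medians and modes to the test split
test_features = test_features.fillna(median_dict)
test_features = test_features.fillna(mode_dict)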

Feature Encoding Implementation

Apply binary and one-hot encoding transformations:

# Binary encoding for selected features
encoding_reference = pd.read_excel('insurance_data_dictionary_cleaned.xlsx', sheet_name=2)
binary_encoding_columns = encoding_reference[encoding_reference['transformation']=='binary'].variable_name

binary_features = test_features[binary_encoding_columns]
encoded_binary = OrdinalEncoder().fit_transform(binary_features)

binary_dataframe = pd.DataFrame(
    data=encoded_binary, 
    columns=binary_features.columns, 
    index=binary_features.index
)
test_features[binary_encoding_columns] = binary_dataframe
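
Note that fitting the encoder on the test split is shown for brevity; in practice the encoder is fitted on the training split and only applied to the test split. A minimal sketch of that pattern:

# fit on the training split, transform both splits
binary_encoder = OrdinalEncoder()
train_features[binary_encoding_columns] = binary_encoder.fit_transform(
    train_features[binary_encoding_columns]
)
test_features[binary_encoding_columns] = binary_encoder.transform(
    test_features[binary_encoding_columns]
)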

Generate dummy variables for categorical features:

# One-hot encoding implementation
onehot_columns = encoding_reference[encoding_reference['transformation']=='dummy'].variable_name

# Split the one-hot columns by dtype (these two lists were implicit in the original)
string_columns = [c for c in onehot_columns if test_features[c].dtype == 'object']
nonstring_columns = [c for c in onehot_columns if c not in string_columns]

# Process string-type categorical variables
string_dummies = pd.get_dummies(translate_headers(test_features[string_columns]))

# Process non-string categorical variables (cast to str so get_dummies treats them as categories)
nonstring_features = test_features[nonstring_columns].astype(str)
nonstring_dummies = pd.get_dummies(translate_headers(nonstring_features))

# Combine encoded features
remaining_features = translate_headers(test_features.drop(columns=string_columns+nonstring_columns))
final_test_features = pd.concat([remaining_features, string_dummies, nonstring_dummies], axis=1)
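
get_dummies can yield different column sets on the two splits when a category is absent from one of them; reindexing the test frame against the training columns keeps the feature matrices aligned (a sketch, where final_train_features is assumed to be built the same way from train_features):

final_test_features = final_test_features.reindex(
    columns=final_train_features.columns, fill_value=0
)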

Model Development and Evaluation

Train initial decision tree classifier with cross-validation:

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

classifier = DecisionTreeClassifier(random_state=420, class_weight='balanced')
cross_validation_scores = cross_val_score(classifier, train_features, train_labels)
average_score = cross_validation_scores.mean()  # Approximately 0.599
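
Since the grid search below is scored with ROC AUC, the baseline can be cross-validated the same way for a direct comparison (a small added sketch):

auc_scores = cross_val_score(classifier, train_features, train_labels,
                             scoring='roc_auc', cv=5)
print(auc_scores.mean())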

Optimize hyperparameters using grid search:

from sklearn.model_selection import GridSearchCV

parameter_grid = {
    'splitter': ('best','random'),
    'criterion': ('gini','entropy'),
    'max_depth': range(3,15)
}

grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameter_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=2
)
# note: the iid argument was deprecated and removed in scikit-learn 0.24

grid_search.fit(train_features, train_labels)
optimal_score = grid_search.best_score_  # Approximately 0.692
optimal_parameters = grid_search.best_params_
# {'criterion': 'entropy', 'max_depth': 6, 'splitter': 'best'}

Evaluate model performance using multiple metrics:

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve

predictions = grid_search.predict(test_features)
# scikit-learn metrics take the true labels first, then the predictions
accuracy = accuracy_score(test_labels, predictions)      # ~0.609
precision = precision_score(test_labels, predictions)    # ~0.510
recall = recall_score(test_labels, predictions)          # ~0.748

# ROC curves are computed from probability scores, not hard class labels
probabilities = grid_search.predict_proba(test_features)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, probabilities)
plt.plot(false_positive_rate, true_positive_rate, c='b', label='ROC Curve')
plt.plot([0, 1], [0, 1], c='r', ls='--')  # chance diagonal
plt.legend()
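
The curve can also be summarized as a single number with the area under it (an added sketch reusing the probabilities computed above):

from sklearn.metrics import roc_auc_score

auc = roc_auc_score(test_labels, probabilities)
print(f"Test AUC: {auc:.3f}")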

Export decision tree visualization:

from sklearn import tree
import graphviz

optimized_classifier = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    splitter='best',
    class_weight='balanced',  # match the estimator used during the search
    random_state=420
)

fitted_model = optimized_classifier.fit(train_features, train_labels)
feature_names = train_features.columns

tree_visualization = tree.export_graphviz(
    fitted_model,
    feature_names=feature_names,
    class_names=['Not Purchased','Purchased'],
    filled=True,
    rounded=True,
    leaves_parallel=False
)

graph = graphviz.Source(tree_visualization)
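
Rendering the Source object writes the tree to disk; graphviz supports several output formats ('decision_tree' is an illustrative file name):

graph.render('decision_tree', format='png', cleanup=True)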
