-
Notifications
You must be signed in to change notification settings - Fork 0
/
LearningToRank_FeatureImportance.py
83 lines (67 loc) · 2.91 KB
/
LearningToRank_FeatureImportance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import plot_importance
class DataLoader:
    """Read the dataset CSV and label its feature columns from a names file.

    The first two columns of the data file keep the labels they were read
    with; the columns after them are relabelled with the names found in the
    column-names file.
    """

    # Index of the first column that receives a name from the names file.
    _FIRST_FEATURE_COL = 2
    # At most this many columns are relabelled (columns 2..137 of the frame).
    _NUM_FEATURES = 136

    def __init__(self, data_file_path, column_file_path):
        """Store the two file locations; nothing is read until `load_data`.

        Args:
            data_file_path: Path of the CSV holding the dataset (with a
                header row).
            column_file_path: Path of a header-less CSV whose first column
                lists the feature names, one per row.
        """
        self.data_file_path = data_file_path
        self.column_file_path = column_file_path

    def load_data(self):
        """Load the dataset and return it with named feature columns.

        Returns:
            The DataFrame read from `data_file_path`, with columns from
            index 2 onward (up to 136 of them) renamed from the names file.

        Raises:
            ValueError: If the number of names does not match the number of
                columns that are to be renamed.
        """
        df = pd.read_csv(self.data_file_path)

        # `read_csv(squeeze=True)` no longer exists in pandas 2.x, so the
        # single column of names is selected explicitly instead.
        names = pd.read_csv(self.column_file_path, header=None).iloc[:, 0].tolist()

        # Remove the last column name if there are 137 names in the list.
        if len(names) == self._NUM_FEATURES + 1:
            names = names[:-1]

        start = self._FIRST_FEATURE_COL
        stop = start + self._NUM_FEATURES
        labels = df.columns.tolist()
        slots = len(labels[start:stop])
        if len(names) != slots:
            raise ValueError(
                f"Expected {slots} feature names for columns {start}..{start + slots - 1}, "
                f"got {len(names)}"
            )

        # Assign a fresh label list rather than writing into
        # `df.columns.values`: mutating the Index's backing array bypasses
        # its lookup caches and fails when that array is read-only.
        labels[start:stop] = names
        df.columns = labels
        return df
class DataAnalyzer:
    """Console summary and bar chart for the column labelled '0'."""

    @staticmethod
    def analyze_data(df):
        """Print the frame's shape, first ten rows and distinct scores, then plot.

        Args:
            df: DataFrame that has a column labelled '0'; its values are
                reported as relevance scores.
        """
        print(f"Shape of DataFrame: {df.shape}")
        print(df.head(10))

        relevance = df['0']
        print("Unique Relevance Scores:", relevance.unique())

        # Per-score occurrence counts drive the bar chart.
        DataAnalyzer.plot_distribution(relevance.value_counts())

    @staticmethod
    def plot_distribution(freq):
        """Show a bar chart with one bar per entry of `freq`.

        Args:
            freq: Series whose index supplies the bar positions and whose
                values supply the bar heights.
        """
        scores, counts = freq.index, freq.values
        plt.bar(scores, counts)
        plt.title("Relevance Score Distribution")
        plt.xlabel('Relevance Score')
        plt.ylabel('Frequency')
        plt.show()
class ModelTrainer:
    """Thin wrapper around an `xgb.XGBClassifier` created with default settings."""

    def __init__(self):
        """Create the untrained classifier and keep it on `self.model`."""
        self.model = xgb.XGBClassifier()

    def train_model(self, X_train, y_train):
        """Fit `self.model` on the given features and targets.

        Args:
            X_train: Feature data passed straight to the model's `fit`.
            y_train: Target values passed straight to the model's `fit`.
        """
        self.model.fit(X_train, y_train)

    def feature_importance(self, feature_names):
        """Plot the ten most important features of the fitted model.

        Args:
            feature_names: Names stored on the model's booster before
                plotting, so the chart is labelled with them.
        """
        booster = self.model.get_booster()
        booster.feature_names = feature_names
        plot_importance(booster, max_num_features=10)
        plt.show()
class DataPreprocessor:
    """Helpers that turn the loaded frame into train/test partitions."""

    @staticmethod
    def split_data(df, test_size=0.3, random_state=42):
        """Split `df` into train and test features/targets and print their shapes.

        Args:
            df: DataFrame whose first column is used as the target and whose
                columns from index 2 onward are used as features.
            test_size: Forwarded to `train_test_split`.
            random_state: Forwarded to `train_test_split`.

        Returns:
            The tuple `(X_train, X_test, y_train, y_test)`.
        """
        features = df.iloc[:, 2:]
        target = df.iloc[:, 0]

        X_train, X_test, y_train, y_test = train_test_split(
            features,
            target,
            test_size=test_size,
            random_state=random_state,
        )

        # Report each partition's shape in a fixed order.
        for label, part in (
            ("X_train", X_train),
            ("y_train", y_train),
            ("X_test", X_test),
            ("y_test", y_test),
        ):
            print(f"Shape of {label}: {part.shape}")

        return X_train, X_test, y_train, y_test
def main():
    """Run the pipeline: load, summarise, split, train, plot importances."""
    # Input locations.
    data_file_path = '/Users/atifsiddiqui/Documents/SPRING2023/IR/fold1_train_sample_all_queries.csv'
    column_file_path = '/Users/atifsiddiqui/Documents/SPRING2023/IR/file.csv'

    # Load the dataset and attach the feature names.
    df = DataLoader(data_file_path, column_file_path).load_data()

    # Print the summary and show the score distribution.
    DataAnalyzer.analyze_data(df)

    # Only the training partitions are used below.
    X_train, _, y_train, _ = DataPreprocessor.split_data(df)

    # Fit the classifier, then chart importances labelled with the
    # feature column names (every column from index 2 onward).
    trainer = ModelTrainer()
    trainer.train_model(X_train, y_train)
    trainer.feature_importance(df.columns[2:].tolist())


if __name__ == "__main__":
    main()