Add py-final.py
This commit is contained in:
460
py-final.py
Normal file
460
py-final.py
Normal file
@ -0,0 +1,460 @@
|
||||
import sys
|
||||
import os
|
||||
import pandas as pd
|
||||
import joblib
|
||||
from PyQt6.QtWidgets import (QApplication, QWidget, QPushButton, QFileDialog, QLabel, QVBoxLayout, QTableWidget, QTableWidgetItem)
|
||||
from PyQt6.QtCore import Qt, QThread, pyqtSignal
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.naive_bayes import CategoricalNB
|
||||
from imblearn.over_sampling import SMOTE
|
||||
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
import getpass
|
||||
|
||||
def combine_excel_files(input_dir, output_dir, combined_file, log_var):
|
||||
# Menggabungkan semua file Excel dalam direktori input menjadi satu file Excel
|
||||
all_data = []
|
||||
|
||||
# Mendapatkan semua file Excel di direktori input
|
||||
excel_files = [f for f in os.listdir(input_dir) if f.endswith(".xlsx")]
|
||||
if not excel_files:
|
||||
log_var.emit("Error: Direktori tidak berisi file Excel apa pun.")
|
||||
return
|
||||
|
||||
# Membuat direktori output jika belum ada
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
output_path = os.path.join(output_dir, combined_file)
|
||||
|
||||
for file in excel_files:
|
||||
file_path = os.path.join(input_dir, file)
|
||||
log_var.emit(f"Menggabungkan file: {file}")
|
||||
xls = pd.ExcelFile(file_path)
|
||||
|
||||
for sheet_name in xls.sheet_names:
|
||||
df = pd.read_excel(xls, sheet_name=sheet_name)
|
||||
df["Playground"] = file.replace(".xlsx", "")
|
||||
|
||||
if 'Position' in df.columns:
|
||||
df["Position"] = df["Position"].str.replace('_', '/', regex=False)
|
||||
|
||||
all_data.append(df)
|
||||
|
||||
# Menggabungkan semua data dari berbagai file Excel
|
||||
combined_df = pd.concat(all_data, ignore_index=True)
|
||||
combined_df.to_excel(output_path, index=False)
|
||||
log_var.emit(f"Dataset yang digabungkan disimpan ke {output_path}")
|
||||
|
||||
def show_columns(file_path, log_var):
|
||||
# Menampilkan kolom yang ada dalam file Excel
|
||||
try:
|
||||
df = pd.read_excel(file_path)
|
||||
log_var.emit(f"Kolom dalam {file_path}: {', '.join(df.columns.tolist())}")
|
||||
except FileNotFoundError:
|
||||
log_var.emit(f"Error: File tidak ditemukan - {file_path}")
|
||||
sys.exit(1)
|
||||
|
||||
def check_required_columns(file_path, required_columns, log_var):
|
||||
# Memeriksa apakah kolom yang diperlukan ada dalam file Excel
|
||||
try:
|
||||
df = pd.read_excel(file_path)
|
||||
missing_columns = [col for col in required_columns if col not in df.columns]
|
||||
if missing_columns:
|
||||
log_var.emit(f"Error: Kolom yang hilang dalam {file_path}: {', '.join(missing_columns)}")
|
||||
sys.exit(1)
|
||||
except FileNotFoundError:
|
||||
log_var.emit(f"Error: File tidak ditemukan - {file_path}")
|
||||
sys.exit(1)
|
||||
|
||||
class ProcessingThread(QThread):
|
||||
update_status = pyqtSignal(str)
|
||||
process_finished = pyqtSignal()
|
||||
classification_report_signal = pyqtSignal(str)
|
||||
distribution_signal = pyqtSignal(str)
|
||||
|
||||
def __init__(self, input_dir, file_pl, file_k, output_dir, combined_file):
|
||||
super().__init__()
|
||||
self.input_dir = input_dir
|
||||
self.file_pl = file_pl
|
||||
self.file_k = file_k
|
||||
self.output_dir = output_dir
|
||||
self.combined_file = combined_file
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
self.update_status.emit("Menggabungkan file Excel...")
|
||||
combine_excel_files(self.input_dir, self.output_dir, self.combined_file, self.update_status)
|
||||
|
||||
self.update_status.emit("Memuat data...")
|
||||
df_cn = pd.read_excel(os.path.join(self.output_dir, self.combined_file))
|
||||
df_pl = pd.read_excel(self.file_pl)
|
||||
df_k = pd.read_excel(self.file_k)
|
||||
|
||||
# Mengubah semua nama kolom menjadi huruf kecil
|
||||
df_cn.columns = df_cn.columns.str.lower()
|
||||
df_pl.columns = df_pl.columns.str.lower()
|
||||
df_k.columns = df_k.columns.str.lower()
|
||||
|
||||
# Mengganti nama kolom untuk konsistensi
|
||||
df_k.rename(columns={'nama universitas': 'university'}, inplace=True)
|
||||
df_pl.rename(columns={'nama posisi di website kampus merdeka': 'position'}, inplace=True)
|
||||
|
||||
# Menghapus spasi di awal dan akhir string dalam kolom tertentu
|
||||
df_cn['university'] = df_cn['university'].str.strip()
|
||||
df_pl['position'] = df_pl['position'].str.strip()
|
||||
df_k['university'] = df_k['university'].str.strip()
|
||||
|
||||
# Menghapus duplikat berdasarkan kolom tertentu
|
||||
df_k = df_k.drop_duplicates(subset=['university'])
|
||||
df_pl = df_pl.drop_duplicates(subset=['position'])
|
||||
|
||||
# Menggabungkan data berdasarkan kolom 'university' dan 'position'
|
||||
df_cn = pd.merge(df_cn,
|
||||
df_k[['university', 'kota / kabupaten kampus', 'provinsi kampus',
|
||||
'preferensi region', 'preferensi area', 'kampus partner',
|
||||
'rank indonesia', 'kategori kampus']],
|
||||
on='university', how='left')
|
||||
|
||||
df_cn = pd.merge(df_cn,
|
||||
df_pl[['position', 'nama kota lengkap', 'provinsi penugasan',
|
||||
'region', 'area', 'jurusan']],
|
||||
on='position', how='left')
|
||||
|
||||
# Mengganti nama kolom untuk kejelasan
|
||||
df_cn.columns = [
|
||||
'cn-id', 'cn-position', 'cn-univ', 'cn-jurusan', 'cn-ipk',
|
||||
'cn-semester', 'cn-tipekampus', 'cn-sertifikat organisasi 1',
|
||||
'cn-sertifikat organisasi 2', 'cn-sertifikat organisasi 3',
|
||||
'cn-sertifikat organisasi 4', 'cn-sertifikat organisasi 5','cn-recstatus',
|
||||
'cn-playground', 'k-kota', 'k-provinsi', 'k-region',
|
||||
'k-area', 'k-partner', 'k-rank', 'k-kategori', 'pl-kota', 'pl-provinsi',
|
||||
'pl-region', 'pl-area', 'pl-jurusan',
|
||||
]
|
||||
|
||||
output_file_path = os.path.join(self.output_dir, "2 - Combined.xlsx")
|
||||
df_cn.to_excel(output_file_path, index=False)
|
||||
|
||||
self.update_status.emit(f"Data dimuat. Total baris: {len(df_cn)}. Memulai penilaian...")
|
||||
|
||||
# Fungsi untuk menilai IPK
|
||||
def score_ipk(cn_ipk):
|
||||
if cn_ipk > 3.75:
|
||||
return 'Perfect'
|
||||
elif cn_ipk > 3.50:
|
||||
return 'Great'
|
||||
elif cn_ipk > 3.25:
|
||||
return 'Good'
|
||||
else:
|
||||
return 'Poor'
|
||||
|
||||
# Fungsi untuk menilai penempatan
|
||||
def score_placement(row):
|
||||
k_kampus = str(row['k-kota'])
|
||||
pl_kota = str(row['pl-kota'])
|
||||
k_region = str(row['k-region'])
|
||||
pl_region = str(row['pl-region'])
|
||||
k_provinsi = str(row['k-provinsi'])
|
||||
pl_provinsi = str(row['pl-provinsi'])
|
||||
k_area = str(row['k-area'])
|
||||
pl_area = str(row['pl-area'])
|
||||
|
||||
if k_kampus == pl_kota:
|
||||
return 'Excellent'
|
||||
elif pl_region in k_region:
|
||||
return 'Perfect'
|
||||
elif k_provinsi == pl_provinsi:
|
||||
return 'Great'
|
||||
elif pl_area in k_area:
|
||||
return 'Good'
|
||||
else:
|
||||
return 'Poor'
|
||||
|
||||
# Fungsi untuk menilai universitas
|
||||
def score_university(k_partner, k_rank, k_kategori):
|
||||
if k_partner == 'Ya' and k_rank in ['Top 10 PTN', 'Top 5 PTS']:
|
||||
return 'Perfect'
|
||||
elif k_partner == 'Ya' and k_rank == 'Regular':
|
||||
return 'Great'
|
||||
elif k_partner == 'Tidak' and k_rank in ['Top 10 PTN', 'Top 5 PTS']:
|
||||
return 'Good'
|
||||
elif k_kategori == 'PTN':
|
||||
return 'Mid'
|
||||
else:
|
||||
return 'Poor'
|
||||
|
||||
# Fungsi untuk menilai organisasi
|
||||
def score_org(row):
|
||||
org_count = sum(not pd.isna(row[col]) for col in ['cn-sertifikat organisasi 1', 'cn-sertifikat organisasi 2',
|
||||
'cn-sertifikat organisasi 3', 'cn-sertifikat organisasi 4',
|
||||
'cn-sertifikat organisasi 5'])
|
||||
if org_count == 0:
|
||||
return 'Poor'
|
||||
elif org_count == 1:
|
||||
return 'Good'
|
||||
else:
|
||||
return 'Great'
|
||||
|
||||
# Fungsi untuk menilai kecocokan jurusan
|
||||
def score_major_match(cn_jurusan, pl_jurusan):
|
||||
if pd.isna(pl_jurusan):
|
||||
return 'Poor'
|
||||
|
||||
pl_jurusan = str(pl_jurusan).strip()
|
||||
cn_jurusan = str(cn_jurusan).strip()
|
||||
|
||||
if pl_jurusan == 'Semua Jurusan':
|
||||
return 'Good'
|
||||
|
||||
pl_jurusan_list = [word.strip().lower() for word in pl_jurusan.split(',')]
|
||||
cn_jurusan_words = cn_jurusan.lower().split()
|
||||
|
||||
if any(cn_word in pl_jurusan_list for cn_word in cn_jurusan_words):
|
||||
return 'Great'
|
||||
|
||||
return 'Poor'
|
||||
|
||||
# Menilai setiap aspek dan menambahkan hasil penilaian ke dataframe
|
||||
self.update_status.emit(f"Mengkategorikan IPK...")
|
||||
df_cn[['ipk_class']] = df_cn['cn-ipk'].apply(lambda x: pd.Series(score_ipk(x)))
|
||||
self.update_status.emit(f"Mengkategorikan Penempatan...")
|
||||
df_cn[['placement_class']] = df_cn.apply(lambda row: pd.Series(score_placement(row)), axis=1)
|
||||
self.update_status.emit(f"Mengkategorikan Universitas...")
|
||||
df_cn[['univ_class']] = df_cn.apply(lambda row: pd.Series(score_university(row['k-partner'], row['k-rank'], row['k-kategori'])), axis=1)
|
||||
self.update_status.emit(f"Mengkategorikan Organisasi...")
|
||||
df_cn[['org_class']] = df_cn.apply(lambda row: pd.Series(score_org(row)), axis=1)
|
||||
self.update_status.emit(f"Mengkategorikan Penilaian Jurusan...")
|
||||
df_cn[['major_match_class']] = df_cn.apply(lambda row: pd.Series(score_major_match(row['cn-jurusan'], row['pl-jurusan'])), axis=1)
|
||||
|
||||
self.update_status.emit(f"Penilaian selesai. Mengekspor...")
|
||||
|
||||
scored_file_path = os.path.join(self.output_dir, "3 - Scored.xlsx")
|
||||
df_cn.to_excel(scored_file_path, index=False)
|
||||
|
||||
user_name = getpass.getuser()
|
||||
|
||||
model_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\AITrain.pkl'
|
||||
label_encoders_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\label_encoders.pkl'
|
||||
data_path = scored_file_path
|
||||
|
||||
model_dir = os.path.dirname(model_path)
|
||||
if not os.path.exists(model_dir):
|
||||
os.makedirs(model_dir)
|
||||
self.update_status.emit(f"Direktori dibuat: {model_dir}")
|
||||
|
||||
label_encoders = {}
|
||||
if os.path.exists(model_path) and os.path.exists(label_encoders_path):
|
||||
nb_model = joblib.load(model_path)
|
||||
self.update_status.emit("Model pelatihan tersedia. Menggunakan model...")
|
||||
|
||||
label_encoders = joblib.load(label_encoders_path)
|
||||
self.update_status.emit("Label encoders dimuat.")
|
||||
data = pd.read_excel(data_path)
|
||||
|
||||
# Menggunakan label encoders yang dimuat untuk mentransformasi data
|
||||
for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']:
|
||||
le = label_encoders[column] # Memuat encoder yang ada
|
||||
data[column] = le.transform(data[column]) # Transformasi menggunakan encoder yang dimuat
|
||||
|
||||
X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']]
|
||||
y = data['cn-recstatus']
|
||||
|
||||
# Menerapkan SMOTE ke seluruh dataset
|
||||
smote = SMOTE(random_state=42)
|
||||
X_resampled, y_resampled = smote.fit_resample(X, y)
|
||||
|
||||
# Membagi data yang telah di-resample menjadi set pelatihan dan pengujian
|
||||
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
|
||||
|
||||
# Melatih model
|
||||
nb_model.fit(X_train, y_train)
|
||||
|
||||
# Memprediksi dan mengevaluasi
|
||||
y_pred = nb_model.predict(X_test)
|
||||
report = classification_report(y_test, y_pred)
|
||||
distribution = pd.Series(y_pred).value_counts().to_string()
|
||||
self.classification_report_signal.emit(report)
|
||||
self.distribution_signal.emit(distribution)
|
||||
|
||||
|
||||
# Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test)
|
||||
export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe
|
||||
export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe
|
||||
export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe
|
||||
|
||||
# Menyimpan dataframe ke dalam file Excel
|
||||
final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx")
|
||||
export_df.to_excel(final_file_path, index=False)
|
||||
self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}")
|
||||
|
||||
|
||||
else:
|
||||
data = pd.read_excel(data_path)
|
||||
|
||||
# Membuat label encoders baru dan melatihnya
|
||||
label_encoders = {}
|
||||
for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']:
|
||||
le = LabelEncoder()
|
||||
data[column] = le.fit_transform(data[column])
|
||||
label_encoders[column] = le # Menyimpan encoder untuk penggunaan di masa depan
|
||||
|
||||
X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']]
|
||||
y = data['cn-recstatus']
|
||||
|
||||
# Menerapkan SMOTE ke seluruh dataset
|
||||
smote = SMOTE(random_state=42)
|
||||
X_resampled, y_resampled = smote.fit_resample(X, y)
|
||||
|
||||
# Membagi data yang telah di-resample menjadi set pelatihan dan pengujian
|
||||
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
|
||||
|
||||
# Melatih model
|
||||
nb_model = CategoricalNB()
|
||||
nb_model.fit(X_train, y_train)
|
||||
|
||||
# Memprediksi dan mengevaluasi
|
||||
y_pred = nb_model.predict(X_test)
|
||||
self.update_status.emit(f"Akurasi: {accuracy_score(y_test, y_pred)}")
|
||||
report = classification_report(y_test, y_pred)
|
||||
distribution = pd.Series(y_pred).value_counts().to_string()
|
||||
self.classification_report_signal.emit(report)
|
||||
self.distribution_signal.emit(distribution)
|
||||
|
||||
# Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test)
|
||||
export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe
|
||||
export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe
|
||||
export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe
|
||||
|
||||
# Menyimpan dataframe ke dalam file Excel
|
||||
final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx")
|
||||
export_df.to_excel(final_file_path, index=False)
|
||||
self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}")
|
||||
|
||||
# Menyimpan model dan label encoders
|
||||
try:
|
||||
joblib.dump(nb_model, model_path)
|
||||
self.update_status.emit(f"Model disimpan ke {model_path}")
|
||||
joblib.dump(label_encoders, label_encoders_path)
|
||||
self.update_status.emit(f"Label encoders disimpan ke {label_encoders_path}")
|
||||
except Exception as e:
|
||||
self.update_status.emit(f"Error menyimpan label encoders: {str(e)}")
|
||||
self.process_finished.emit()
|
||||
except Exception as e:
|
||||
self.update_status.emit(f"Error: {str(e)}")
|
||||
self.process_finished.emit()
|
||||
|
||||
|
||||
class ResultWindow(QWidget):
|
||||
def __init__(self, classification_report, distribution):
|
||||
super().__init__()
|
||||
self.setWindowTitle("Classification Report and Distribution")
|
||||
self.setGeometry(100, 100, 600, 400) # Ukuran disesuaikan
|
||||
self.setFixedSize(500, 300)
|
||||
|
||||
layout = QVBoxLayout()
|
||||
|
||||
self.table = QTableWidget()
|
||||
self.table.setColumnCount(2)
|
||||
self.table.setHorizontalHeaderLabels(["Classification Report", "Distribution"])
|
||||
|
||||
report_lines = classification_report.split('\n')
|
||||
distribution_lines = distribution.split('\n')
|
||||
|
||||
max_rows = max(len(report_lines), len(distribution_lines))
|
||||
self.table.setRowCount(max_rows)
|
||||
|
||||
# Mengatur lebar kolom
|
||||
self.table.setColumnWidth(0, 300) # Lebar untuk laporan klasifikasi
|
||||
self.table.setColumnWidth(1, 130) # Lebar untuk distribusi
|
||||
|
||||
for i, line in enumerate(report_lines):
|
||||
item = QTableWidgetItem(line)
|
||||
item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit
|
||||
self.table.setItem(i, 0, item)
|
||||
|
||||
for i, line in enumerate(distribution_lines):
|
||||
item = QTableWidgetItem(line)
|
||||
item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit
|
||||
self.table.setItem(i, 1, item)
|
||||
|
||||
layout.addWidget(self.table)
|
||||
self.setLayout(layout)
|
||||
|
||||
class MainWindow(QWidget):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.initUI()
|
||||
self.script_dir = os.path.dirname(__file__)
|
||||
self.file_pl = os.path.join(self.script_dir, "pl.xlsx")
|
||||
self.file_k = os.path.join(self.script_dir, "k.xlsx")
|
||||
self.label_status.setText(f"Data Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}")
|
||||
# Inisialisasi variabel untuk menghindari UnboundLocalError
|
||||
self.classification_report = "" # Nilai default
|
||||
self.distribution = "" # Nilai default
|
||||
|
||||
def initUI(self):
|
||||
self.setWindowTitle("TroopersSelect Data Processor")
|
||||
self.setGeometry(100, 100, 600, 300)
|
||||
self.setFixedSize(600, 300)
|
||||
|
||||
layout = QVBoxLayout()
|
||||
|
||||
self.label_status = QLabel("Pilih file dan folder untuk memulai.")
|
||||
self.label_status.setAlignment(Qt.AlignmentFlag.AlignCenter)
|
||||
|
||||
self.btn_select_folder = QPushButton("Pilih Folder Data")
|
||||
self.btn_execute = QPushButton("Eksekusi")
|
||||
|
||||
self.btn_select_folder.clicked.connect(self.select_folder)
|
||||
self.btn_execute.clicked.connect(self.execute_process)
|
||||
|
||||
layout.addWidget(self.label_status)
|
||||
layout.addWidget(self.btn_select_folder)
|
||||
layout.addWidget(self.btn_execute)
|
||||
|
||||
self.setLayout(layout)
|
||||
|
||||
self.input_dir = ""
|
||||
|
||||
def select_folder(self):
|
||||
# Memilih folder yang berisi data
|
||||
folder = QFileDialog.getExistingDirectory(self, "Pilih Folder Data")
|
||||
if folder:
|
||||
self.input_dir = folder
|
||||
self.label_status.setText(f"Folder Dipilih: {folder}\nData Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}")
|
||||
|
||||
def execute_process(self):
|
||||
# Mengeksekusi proses pengolahan data
|
||||
if not self.input_dir:
|
||||
self.label_status.setText("Error: Silakan pilih folder data.")
|
||||
return
|
||||
|
||||
self.label_status.setText("Memulai proses...")
|
||||
home_dir = os.path.expanduser("~")
|
||||
output_dir = os.path.join(home_dir, "Documents", "TroopersSelect", "Output")
|
||||
combined_file = "1 - KM-PL Combined.xlsx"
|
||||
self.processing_thread = ProcessingThread(self.input_dir, self.file_pl, self.file_k, output_dir, combined_file)
|
||||
self.processing_thread.update_status.connect(self.label_status.setText)
|
||||
self.processing_thread.process_finished.connect(self.show_result_window)
|
||||
self.processing_thread.classification_report_signal.connect(self.set_classification_report)
|
||||
self.processing_thread.distribution_signal.connect(self.set_distribution)
|
||||
self.btn_execute.setEnabled(False)
|
||||
self.processing_thread.start()
|
||||
|
||||
def set_classification_report(self, report):
|
||||
# Menyimpan laporan klasifikasi
|
||||
self.classification_report = report
|
||||
|
||||
def set_distribution(self, distribution):
|
||||
# Menyimpan distribusi prediksi
|
||||
self.distribution = distribution
|
||||
|
||||
def show_result_window(self):
|
||||
# Menampilkan jendela hasil
|
||||
self.result_window = ResultWindow(self.classification_report, self.distribution)
|
||||
self.result_window.show()
|
||||
self.btn_execute.setEnabled(True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
app = QApplication(sys.argv)
|
||||
window = MainWindow()
|
||||
window.show()
|
||||
sys.exit(app.exec())
|
Reference in New Issue
Block a user