import sys import os import pandas as pd import joblib from PyQt6.QtWidgets import (QApplication, QWidget, QPushButton, QFileDialog, QLabel, QVBoxLayout, QTableWidget, QTableWidgetItem) from PyQt6.QtCore import Qt, QThread, pyqtSignal from sklearn.model_selection import train_test_split from sklearn.naive_bayes import CategoricalNB from imblearn.over_sampling import SMOTE from sklearn.metrics import accuracy_score, classification_report, confusion_matrix from sklearn.preprocessing import LabelEncoder import getpass def combine_excel_files(input_dir, output_dir, combined_file, log_var): # Menggabungkan semua file Excel dalam direktori input menjadi satu file Excel all_data = [] # Mendapatkan semua file Excel di direktori input excel_files = [f for f in os.listdir(input_dir) if f.endswith(".xlsx")] if not excel_files: log_var.emit("Error: Direktori tidak berisi file Excel apa pun.") return # Membuat direktori output jika belum ada os.makedirs(output_dir, exist_ok=True) output_path = os.path.join(output_dir, combined_file) for file in excel_files: file_path = os.path.join(input_dir, file) log_var.emit(f"Menggabungkan file: {file}") xls = pd.ExcelFile(file_path) for sheet_name in xls.sheet_names: df = pd.read_excel(xls, sheet_name=sheet_name) df["Playground"] = file.replace(".xlsx", "") if 'Position' in df.columns: df["Position"] = df["Position"].str.replace('_', '/', regex=False) all_data.append(df) # Menggabungkan semua data dari berbagai file Excel combined_df = pd.concat(all_data, ignore_index=True) combined_df.to_excel(output_path, index=False) log_var.emit(f"Dataset yang digabungkan disimpan ke {output_path}") def show_columns(file_path, log_var): # Menampilkan kolom yang ada dalam file Excel try: df = pd.read_excel(file_path) log_var.emit(f"Kolom dalam {file_path}: {', '.join(df.columns.tolist())}") except FileNotFoundError: log_var.emit(f"Error: File tidak ditemukan - {file_path}") sys.exit(1) def check_required_columns(file_path, required_columns, log_var): # Memeriksa apakah kolom yang diperlukan ada dalam file Excel try: df = pd.read_excel(file_path) missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: log_var.emit(f"Error: Kolom yang hilang dalam {file_path}: {', '.join(missing_columns)}") sys.exit(1) except FileNotFoundError: log_var.emit(f"Error: File tidak ditemukan - {file_path}") sys.exit(1) class ProcessingThread(QThread): update_status = pyqtSignal(str) process_finished = pyqtSignal() classification_report_signal = pyqtSignal(str) distribution_signal = pyqtSignal(str) def __init__(self, input_dir, file_pl, file_k, output_dir, combined_file): super().__init__() self.input_dir = input_dir self.file_pl = file_pl self.file_k = file_k self.output_dir = output_dir self.combined_file = combined_file def run(self): try: self.update_status.emit("Menggabungkan file Excel...") combine_excel_files(self.input_dir, self.output_dir, self.combined_file, self.update_status) self.update_status.emit("Memuat data...") df_cn = pd.read_excel(os.path.join(self.output_dir, self.combined_file)) df_pl = pd.read_excel(self.file_pl) df_k = pd.read_excel(self.file_k) # Mengubah semua nama kolom menjadi huruf kecil df_cn.columns = df_cn.columns.str.lower() df_pl.columns = df_pl.columns.str.lower() df_k.columns = df_k.columns.str.lower() # Mengganti nama kolom untuk konsistensi df_k.rename(columns={'nama universitas': 'university'}, inplace=True) df_pl.rename(columns={'nama posisi di website kampus merdeka': 'position'}, inplace=True) # Menghapus spasi di awal dan akhir string dalam kolom tertentu df_cn['university'] = df_cn['university'].str.strip() df_pl['position'] = df_pl['position'].str.strip() df_k['university'] = df_k['university'].str.strip() # Menghapus duplikat berdasarkan kolom tertentu df_k = df_k.drop_duplicates(subset=['university']) df_pl = df_pl.drop_duplicates(subset=['position']) # Menggabungkan data berdasarkan kolom 'university' dan 'position' df_cn = pd.merge(df_cn, df_k[['university', 'kota / kabupaten kampus', 'provinsi kampus', 'preferensi region', 'preferensi area', 'kampus partner', 'rank indonesia', 'kategori kampus']], on='university', how='left') df_cn = pd.merge(df_cn, df_pl[['position', 'nama kota lengkap', 'provinsi penugasan', 'region', 'area', 'jurusan']], on='position', how='left') # Mengganti nama kolom untuk kejelasan df_cn.columns = [ 'cn-id', 'cn-position', 'cn-univ', 'cn-jurusan', 'cn-ipk', 'cn-semester', 'cn-tipekampus', 'cn-sertifikat organisasi 1', 'cn-sertifikat organisasi 2', 'cn-sertifikat organisasi 3', 'cn-sertifikat organisasi 4', 'cn-sertifikat organisasi 5','cn-recstatus', 'cn-playground', 'k-kota', 'k-provinsi', 'k-region', 'k-area', 'k-partner', 'k-rank', 'k-kategori', 'pl-kota', 'pl-provinsi', 'pl-region', 'pl-area', 'pl-jurusan', ] output_file_path = os.path.join(self.output_dir, "2 - Combined.xlsx") df_cn.to_excel(output_file_path, index=False) self.update_status.emit(f"Data dimuat. Total baris: {len(df_cn)}. Memulai penilaian...") # Fungsi untuk menilai IPK def score_ipk(cn_ipk): if cn_ipk > 3.75: return 'Perfect' elif cn_ipk > 3.50: return 'Great' elif cn_ipk > 3.25: return 'Good' else: return 'Poor' # Fungsi untuk menilai penempatan def score_placement(row): k_kampus = str(row['k-kota']) pl_kota = str(row['pl-kota']) k_region = str(row['k-region']) pl_region = str(row['pl-region']) k_provinsi = str(row['k-provinsi']) pl_provinsi = str(row['pl-provinsi']) k_area = str(row['k-area']) pl_area = str(row['pl-area']) if k_kampus == pl_kota: return 'Excellent' elif pl_region in k_region: return 'Perfect' elif k_provinsi == pl_provinsi: return 'Great' elif pl_area in k_area: return 'Good' else: return 'Poor' # Fungsi untuk menilai universitas def score_university(k_partner, k_rank, k_kategori): if k_partner == 'Ya' and k_rank in ['Top 10 PTN', 'Top 5 PTS']: return 'Perfect' elif k_partner == 'Ya' and k_rank == 'Regular': return 'Great' elif k_partner == 'Tidak' and k_rank in ['Top 10 PTN', 'Top 5 PTS']: return 'Good' elif k_kategori == 'PTN': return 'Mid' else: return 'Poor' # Fungsi untuk menilai organisasi def score_org(row): org_count = sum(not pd.isna(row[col]) for col in ['cn-sertifikat organisasi 1', 'cn-sertifikat organisasi 2', 'cn-sertifikat organisasi 3', 'cn-sertifikat organisasi 4', 'cn-sertifikat organisasi 5']) if org_count == 0: return 'Poor' elif org_count == 1: return 'Good' else: return 'Great' # Fungsi untuk menilai kecocokan jurusan def score_major_match(cn_jurusan, pl_jurusan): if pd.isna(pl_jurusan): return 'Poor' pl_jurusan = str(pl_jurusan).strip() cn_jurusan = str(cn_jurusan).strip() if pl_jurusan == 'Semua Jurusan': return 'Good' pl_jurusan_list = [word.strip().lower() for word in pl_jurusan.split(',')] cn_jurusan_words = cn_jurusan.lower().split() if any(cn_word in pl_jurusan_list for cn_word in cn_jurusan_words): return 'Great' return 'Poor' # Menilai setiap aspek dan menambahkan hasil penilaian ke dataframe self.update_status.emit(f"Mengkategorikan IPK...") df_cn[['ipk_class']] = df_cn['cn-ipk'].apply(lambda x: pd.Series(score_ipk(x))) self.update_status.emit(f"Mengkategorikan Penempatan...") df_cn[['placement_class']] = df_cn.apply(lambda row: pd.Series(score_placement(row)), axis=1) self.update_status.emit(f"Mengkategorikan Universitas...") df_cn[['univ_class']] = df_cn.apply(lambda row: pd.Series(score_university(row['k-partner'], row['k-rank'], row['k-kategori'])), axis=1) self.update_status.emit(f"Mengkategorikan Organisasi...") df_cn[['org_class']] = df_cn.apply(lambda row: pd.Series(score_org(row)), axis=1) self.update_status.emit(f"Mengkategorikan Penilaian Jurusan...") df_cn[['major_match_class']] = df_cn.apply(lambda row: pd.Series(score_major_match(row['cn-jurusan'], row['pl-jurusan'])), axis=1) self.update_status.emit(f"Penilaian selesai. Mengekspor...") scored_file_path = os.path.join(self.output_dir, "3 - Scored.xlsx") df_cn.to_excel(scored_file_path, index=False) user_name = getpass.getuser() model_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\AITrain.pkl' label_encoders_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\label_encoders.pkl' data_path = scored_file_path model_dir = os.path.dirname(model_path) if not os.path.exists(model_dir): os.makedirs(model_dir) self.update_status.emit(f"Direktori dibuat: {model_dir}") label_encoders = {} if os.path.exists(model_path) and os.path.exists(label_encoders_path): nb_model = joblib.load(model_path) self.update_status.emit("Model pelatihan tersedia. Menggunakan model...") label_encoders = joblib.load(label_encoders_path) self.update_status.emit("Label encoders dimuat.") data = pd.read_excel(data_path) # Menggunakan label encoders yang dimuat untuk mentransformasi data for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']: le = label_encoders[column] # Memuat encoder yang ada data[column] = le.transform(data[column]) # Transformasi menggunakan encoder yang dimuat X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']] y = data['cn-recstatus'] # Menerapkan SMOTE ke seluruh dataset smote = SMOTE(random_state=42) X_resampled, y_resampled = smote.fit_resample(X, y) # Membagi data yang telah di-resample menjadi set pelatihan dan pengujian X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42) # Melatih model nb_model.fit(X_train, y_train) # Memprediksi dan mengevaluasi y_pred = nb_model.predict(X_test) report = classification_report(y_test, y_pred) distribution = pd.Series(y_pred).value_counts().to_string() self.classification_report_signal.emit(report) self.distribution_signal.emit(distribution) # Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test) export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe # Menyimpan dataframe ke dalam file Excel final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx") export_df.to_excel(final_file_path, index=False) self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}") else: data = pd.read_excel(data_path) # Membuat label encoders baru dan melatihnya label_encoders = {} for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']: le = LabelEncoder() data[column] = le.fit_transform(data[column]) label_encoders[column] = le # Menyimpan encoder untuk penggunaan di masa depan X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']] y = data['cn-recstatus'] # Menerapkan SMOTE ke seluruh dataset smote = SMOTE(random_state=42) X_resampled, y_resampled = smote.fit_resample(X, y) # Membagi data yang telah di-resample menjadi set pelatihan dan pengujian X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42) # Melatih model nb_model = CategoricalNB() nb_model.fit(X_train, y_train) # Memprediksi dan mengevaluasi y_pred = nb_model.predict(X_test) self.update_status.emit(f"Akurasi: {accuracy_score(y_test, y_pred)}") report = classification_report(y_test, y_pred) distribution = pd.Series(y_pred).value_counts().to_string() self.classification_report_signal.emit(report) self.distribution_signal.emit(distribution) # Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test) export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe # Menyimpan dataframe ke dalam file Excel final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx") export_df.to_excel(final_file_path, index=False) self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}") # Menyimpan model dan label encoders try: joblib.dump(nb_model, model_path) self.update_status.emit(f"Model disimpan ke {model_path}") joblib.dump(label_encoders, label_encoders_path) self.update_status.emit(f"Label encoders disimpan ke {label_encoders_path}") except Exception as e: self.update_status.emit(f"Error menyimpan label encoders: {str(e)}") self.process_finished.emit() except Exception as e: self.update_status.emit(f"Error: {str(e)}") self.process_finished.emit() class ResultWindow(QWidget): def __init__(self, classification_report, distribution): super().__init__() self.setWindowTitle("Classification Report and Distribution") self.setGeometry(100, 100, 600, 400) # Ukuran disesuaikan self.setFixedSize(500, 300) layout = QVBoxLayout() self.table = QTableWidget() self.table.setColumnCount(2) self.table.setHorizontalHeaderLabels(["Classification Report", "Distribution"]) report_lines = classification_report.split('\n') distribution_lines = distribution.split('\n') max_rows = max(len(report_lines), len(distribution_lines)) self.table.setRowCount(max_rows) # Mengatur lebar kolom self.table.setColumnWidth(0, 300) # Lebar untuk laporan klasifikasi self.table.setColumnWidth(1, 130) # Lebar untuk distribusi for i, line in enumerate(report_lines): item = QTableWidgetItem(line) item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit self.table.setItem(i, 0, item) for i, line in enumerate(distribution_lines): item = QTableWidgetItem(line) item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit self.table.setItem(i, 1, item) layout.addWidget(self.table) self.setLayout(layout) class MainWindow(QWidget): def __init__(self): super().__init__() self.initUI() self.script_dir = os.path.dirname(__file__) self.file_pl = os.path.join(self.script_dir, "pl.xlsx") self.file_k = os.path.join(self.script_dir, "k.xlsx") self.label_status.setText(f"Data Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}") # Inisialisasi variabel untuk menghindari UnboundLocalError self.classification_report = "" # Nilai default self.distribution = "" # Nilai default def initUI(self): self.setWindowTitle("TroopersSelect Data Processor") self.setGeometry(100, 100, 600, 300) self.setFixedSize(600, 300) layout = QVBoxLayout() self.label_status = QLabel("Pilih file dan folder untuk memulai.") self.label_status.setAlignment(Qt.AlignmentFlag.AlignCenter) self.btn_select_folder = QPushButton("Pilih Folder Data") self.btn_execute = QPushButton("Eksekusi") self.btn_select_folder.clicked.connect(self.select_folder) self.btn_execute.clicked.connect(self.execute_process) layout.addWidget(self.label_status) layout.addWidget(self.btn_select_folder) layout.addWidget(self.btn_execute) self.setLayout(layout) self.input_dir = "" def select_folder(self): # Memilih folder yang berisi data folder = QFileDialog.getExistingDirectory(self, "Pilih Folder Data") if folder: self.input_dir = folder self.label_status.setText(f"Folder Dipilih: {folder}\nData Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}") def execute_process(self): # Mengeksekusi proses pengolahan data if not self.input_dir: self.label_status.setText("Error: Silakan pilih folder data.") return self.label_status.setText("Memulai proses...") home_dir = os.path.expanduser("~") output_dir = os.path.join(home_dir, "Documents", "TroopersSelect", "Output") combined_file = "1 - KM-PL Combined.xlsx" self.processing_thread = ProcessingThread(self.input_dir, self.file_pl, self.file_k, output_dir, combined_file) self.processing_thread.update_status.connect(self.label_status.setText) self.processing_thread.process_finished.connect(self.show_result_window) self.processing_thread.classification_report_signal.connect(self.set_classification_report) self.processing_thread.distribution_signal.connect(self.set_distribution) self.btn_execute.setEnabled(False) self.processing_thread.start() def set_classification_report(self, report): # Menyimpan laporan klasifikasi self.classification_report = report def set_distribution(self, distribution): # Menyimpan distribusi prediksi self.distribution = distribution def show_result_window(self): # Menampilkan jendela hasil self.result_window = ResultWindow(self.classification_report, self.distribution) self.result_window.show() self.btn_execute.setEnabled(True) if __name__ == "__main__": app = QApplication(sys.argv) window = MainWindow() window.show() sys.exit(app.exec())