From 3cf8861193a747d4eac686fc4b0888461b115279 Mon Sep 17 00:00:00 2001 From: 2110512160 <2110512160@mahasiswa.upnvj.ac.id> Date: Thu, 10 Jul 2025 10:25:15 +0000 Subject: [PATCH] Add py-final.py --- py-final.py | 460 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 py-final.py diff --git a/py-final.py b/py-final.py new file mode 100644 index 0000000..1c437d1 --- /dev/null +++ b/py-final.py @@ -0,0 +1,460 @@ +import sys +import os +import pandas as pd +import joblib +from PyQt6.QtWidgets import (QApplication, QWidget, QPushButton, QFileDialog, QLabel, QVBoxLayout, QTableWidget, QTableWidgetItem) +from PyQt6.QtCore import Qt, QThread, pyqtSignal +from sklearn.model_selection import train_test_split +from sklearn.naive_bayes import CategoricalNB +from imblearn.over_sampling import SMOTE +from sklearn.metrics import accuracy_score, classification_report, confusion_matrix +from sklearn.preprocessing import LabelEncoder +import getpass + +def combine_excel_files(input_dir, output_dir, combined_file, log_var): + # Menggabungkan semua file Excel dalam direktori input menjadi satu file Excel + all_data = [] + + # Mendapatkan semua file Excel di direktori input + excel_files = [f for f in os.listdir(input_dir) if f.endswith(".xlsx")] + if not excel_files: + log_var.emit("Error: Direktori tidak berisi file Excel apa pun.") + return + + # Membuat direktori output jika belum ada + os.makedirs(output_dir, exist_ok=True) + output_path = os.path.join(output_dir, combined_file) + + for file in excel_files: + file_path = os.path.join(input_dir, file) + log_var.emit(f"Menggabungkan file: {file}") + xls = pd.ExcelFile(file_path) + + for sheet_name in xls.sheet_names: + df = pd.read_excel(xls, sheet_name=sheet_name) + df["Playground"] = file.replace(".xlsx", "") + + if 'Position' in df.columns: + df["Position"] = df["Position"].str.replace('_', '/', regex=False) + + all_data.append(df) + + # Menggabungkan semua data dari berbagai file Excel + combined_df = pd.concat(all_data, ignore_index=True) + combined_df.to_excel(output_path, index=False) + log_var.emit(f"Dataset yang digabungkan disimpan ke {output_path}") + +def show_columns(file_path, log_var): + # Menampilkan kolom yang ada dalam file Excel + try: + df = pd.read_excel(file_path) + log_var.emit(f"Kolom dalam {file_path}: {', '.join(df.columns.tolist())}") + except FileNotFoundError: + log_var.emit(f"Error: File tidak ditemukan - {file_path}") + sys.exit(1) + +def check_required_columns(file_path, required_columns, log_var): + # Memeriksa apakah kolom yang diperlukan ada dalam file Excel + try: + df = pd.read_excel(file_path) + missing_columns = [col for col in required_columns if col not in df.columns] + if missing_columns: + log_var.emit(f"Error: Kolom yang hilang dalam {file_path}: {', '.join(missing_columns)}") + sys.exit(1) + except FileNotFoundError: + log_var.emit(f"Error: File tidak ditemukan - {file_path}") + sys.exit(1) + +class ProcessingThread(QThread): + update_status = pyqtSignal(str) + process_finished = pyqtSignal() + classification_report_signal = pyqtSignal(str) + distribution_signal = pyqtSignal(str) + + def __init__(self, input_dir, file_pl, file_k, output_dir, combined_file): + super().__init__() + self.input_dir = input_dir + self.file_pl = file_pl + self.file_k = file_k + self.output_dir = output_dir + self.combined_file = combined_file + + def run(self): + try: + self.update_status.emit("Menggabungkan file Excel...") + combine_excel_files(self.input_dir, self.output_dir, self.combined_file, self.update_status) + + self.update_status.emit("Memuat data...") + df_cn = pd.read_excel(os.path.join(self.output_dir, self.combined_file)) + df_pl = pd.read_excel(self.file_pl) + df_k = pd.read_excel(self.file_k) + + # Mengubah semua nama kolom menjadi huruf kecil + df_cn.columns = df_cn.columns.str.lower() + df_pl.columns = df_pl.columns.str.lower() + df_k.columns = df_k.columns.str.lower() + + # Mengganti nama kolom untuk konsistensi + df_k.rename(columns={'nama universitas': 'university'}, inplace=True) + df_pl.rename(columns={'nama posisi di website kampus merdeka': 'position'}, inplace=True) + + # Menghapus spasi di awal dan akhir string dalam kolom tertentu + df_cn['university'] = df_cn['university'].str.strip() + df_pl['position'] = df_pl['position'].str.strip() + df_k['university'] = df_k['university'].str.strip() + + # Menghapus duplikat berdasarkan kolom tertentu + df_k = df_k.drop_duplicates(subset=['university']) + df_pl = df_pl.drop_duplicates(subset=['position']) + + # Menggabungkan data berdasarkan kolom 'university' dan 'position' + df_cn = pd.merge(df_cn, + df_k[['university', 'kota / kabupaten kampus', 'provinsi kampus', + 'preferensi region', 'preferensi area', 'kampus partner', + 'rank indonesia', 'kategori kampus']], + on='university', how='left') + + df_cn = pd.merge(df_cn, + df_pl[['position', 'nama kota lengkap', 'provinsi penugasan', + 'region', 'area', 'jurusan']], + on='position', how='left') + + # Mengganti nama kolom untuk kejelasan + df_cn.columns = [ + 'cn-id', 'cn-position', 'cn-univ', 'cn-jurusan', 'cn-ipk', + 'cn-semester', 'cn-tipekampus', 'cn-sertifikat organisasi 1', + 'cn-sertifikat organisasi 2', 'cn-sertifikat organisasi 3', + 'cn-sertifikat organisasi 4', 'cn-sertifikat organisasi 5','cn-recstatus', + 'cn-playground', 'k-kota', 'k-provinsi', 'k-region', + 'k-area', 'k-partner', 'k-rank', 'k-kategori', 'pl-kota', 'pl-provinsi', + 'pl-region', 'pl-area', 'pl-jurusan', + ] + + output_file_path = os.path.join(self.output_dir, "2 - Combined.xlsx") + df_cn.to_excel(output_file_path, index=False) + + self.update_status.emit(f"Data dimuat. Total baris: {len(df_cn)}. Memulai penilaian...") + + # Fungsi untuk menilai IPK + def score_ipk(cn_ipk): + if cn_ipk > 3.75: + return 'Perfect' + elif cn_ipk > 3.50: + return 'Great' + elif cn_ipk > 3.25: + return 'Good' + else: + return 'Poor' + + # Fungsi untuk menilai penempatan + def score_placement(row): + k_kampus = str(row['k-kota']) + pl_kota = str(row['pl-kota']) + k_region = str(row['k-region']) + pl_region = str(row['pl-region']) + k_provinsi = str(row['k-provinsi']) + pl_provinsi = str(row['pl-provinsi']) + k_area = str(row['k-area']) + pl_area = str(row['pl-area']) + + if k_kampus == pl_kota: + return 'Excellent' + elif pl_region in k_region: + return 'Perfect' + elif k_provinsi == pl_provinsi: + return 'Great' + elif pl_area in k_area: + return 'Good' + else: + return 'Poor' + + # Fungsi untuk menilai universitas + def score_university(k_partner, k_rank, k_kategori): + if k_partner == 'Ya' and k_rank in ['Top 10 PTN', 'Top 5 PTS']: + return 'Perfect' + elif k_partner == 'Ya' and k_rank == 'Regular': + return 'Great' + elif k_partner == 'Tidak' and k_rank in ['Top 10 PTN', 'Top 5 PTS']: + return 'Good' + elif k_kategori == 'PTN': + return 'Mid' + else: + return 'Poor' + + # Fungsi untuk menilai organisasi + def score_org(row): + org_count = sum(not pd.isna(row[col]) for col in ['cn-sertifikat organisasi 1', 'cn-sertifikat organisasi 2', + 'cn-sertifikat organisasi 3', 'cn-sertifikat organisasi 4', + 'cn-sertifikat organisasi 5']) + if org_count == 0: + return 'Poor' + elif org_count == 1: + return 'Good' + else: + return 'Great' + + # Fungsi untuk menilai kecocokan jurusan + def score_major_match(cn_jurusan, pl_jurusan): + if pd.isna(pl_jurusan): + return 'Poor' + + pl_jurusan = str(pl_jurusan).strip() + cn_jurusan = str(cn_jurusan).strip() + + if pl_jurusan == 'Semua Jurusan': + return 'Good' + + pl_jurusan_list = [word.strip().lower() for word in pl_jurusan.split(',')] + cn_jurusan_words = cn_jurusan.lower().split() + + if any(cn_word in pl_jurusan_list for cn_word in cn_jurusan_words): + return 'Great' + + return 'Poor' + + # Menilai setiap aspek dan menambahkan hasil penilaian ke dataframe + self.update_status.emit(f"Mengkategorikan IPK...") + df_cn[['ipk_class']] = df_cn['cn-ipk'].apply(lambda x: pd.Series(score_ipk(x))) + self.update_status.emit(f"Mengkategorikan Penempatan...") + df_cn[['placement_class']] = df_cn.apply(lambda row: pd.Series(score_placement(row)), axis=1) + self.update_status.emit(f"Mengkategorikan Universitas...") + df_cn[['univ_class']] = df_cn.apply(lambda row: pd.Series(score_university(row['k-partner'], row['k-rank'], row['k-kategori'])), axis=1) + self.update_status.emit(f"Mengkategorikan Organisasi...") + df_cn[['org_class']] = df_cn.apply(lambda row: pd.Series(score_org(row)), axis=1) + self.update_status.emit(f"Mengkategorikan Penilaian Jurusan...") + df_cn[['major_match_class']] = df_cn.apply(lambda row: pd.Series(score_major_match(row['cn-jurusan'], row['pl-jurusan'])), axis=1) + + self.update_status.emit(f"Penilaian selesai. Mengekspor...") + + scored_file_path = os.path.join(self.output_dir, "3 - Scored.xlsx") + df_cn.to_excel(scored_file_path, index=False) + + user_name = getpass.getuser() + + model_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\AITrain.pkl' + label_encoders_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\label_encoders.pkl' + data_path = scored_file_path + + model_dir = os.path.dirname(model_path) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + self.update_status.emit(f"Direktori dibuat: {model_dir}") + + label_encoders = {} + if os.path.exists(model_path) and os.path.exists(label_encoders_path): + nb_model = joblib.load(model_path) + self.update_status.emit("Model pelatihan tersedia. Menggunakan model...") + + label_encoders = joblib.load(label_encoders_path) + self.update_status.emit("Label encoders dimuat.") + data = pd.read_excel(data_path) + + # Menggunakan label encoders yang dimuat untuk mentransformasi data + for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']: + le = label_encoders[column] # Memuat encoder yang ada + data[column] = le.transform(data[column]) # Transformasi menggunakan encoder yang dimuat + + X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']] + y = data['cn-recstatus'] + + # Menerapkan SMOTE ke seluruh dataset + smote = SMOTE(random_state=42) + X_resampled, y_resampled = smote.fit_resample(X, y) + + # Membagi data yang telah di-resample menjadi set pelatihan dan pengujian + X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42) + + # Melatih model + nb_model.fit(X_train, y_train) + + # Memprediksi dan mengevaluasi + y_pred = nb_model.predict(X_test) + report = classification_report(y_test, y_pred) + distribution = pd.Series(y_pred).value_counts().to_string() + self.classification_report_signal.emit(report) + self.distribution_signal.emit(distribution) + + + # Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test) + export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe + export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe + export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe + + # Menyimpan dataframe ke dalam file Excel + final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx") + export_df.to_excel(final_file_path, index=False) + self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}") + + + else: + data = pd.read_excel(data_path) + + # Membuat label encoders baru dan melatihnya + label_encoders = {} + for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']: + le = LabelEncoder() + data[column] = le.fit_transform(data[column]) + label_encoders[column] = le # Menyimpan encoder untuk penggunaan di masa depan + + X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']] + y = data['cn-recstatus'] + + # Menerapkan SMOTE ke seluruh dataset + smote = SMOTE(random_state=42) + X_resampled, y_resampled = smote.fit_resample(X, y) + + # Membagi data yang telah di-resample menjadi set pelatihan dan pengujian + X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42) + + # Melatih model + nb_model = CategoricalNB() + nb_model.fit(X_train, y_train) + + # Memprediksi dan mengevaluasi + y_pred = nb_model.predict(X_test) + self.update_status.emit(f"Akurasi: {accuracy_score(y_test, y_pred)}") + report = classification_report(y_test, y_pred) + distribution = pd.Series(y_pred).value_counts().to_string() + self.classification_report_signal.emit(report) + self.distribution_signal.emit(distribution) + + # Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test) + export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe + export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe + export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe + + # Menyimpan dataframe ke dalam file Excel + final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx") + export_df.to_excel(final_file_path, index=False) + self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}") + + # Menyimpan model dan label encoders + try: + joblib.dump(nb_model, model_path) + self.update_status.emit(f"Model disimpan ke {model_path}") + joblib.dump(label_encoders, label_encoders_path) + self.update_status.emit(f"Label encoders disimpan ke {label_encoders_path}") + except Exception as e: + self.update_status.emit(f"Error menyimpan label encoders: {str(e)}") + self.process_finished.emit() + except Exception as e: + self.update_status.emit(f"Error: {str(e)}") + self.process_finished.emit() + + +class ResultWindow(QWidget): + def __init__(self, classification_report, distribution): + super().__init__() + self.setWindowTitle("Classification Report and Distribution") + self.setGeometry(100, 100, 600, 400) # Ukuran disesuaikan + self.setFixedSize(500, 300) + + layout = QVBoxLayout() + + self.table = QTableWidget() + self.table.setColumnCount(2) + self.table.setHorizontalHeaderLabels(["Classification Report", "Distribution"]) + + report_lines = classification_report.split('\n') + distribution_lines = distribution.split('\n') + + max_rows = max(len(report_lines), len(distribution_lines)) + self.table.setRowCount(max_rows) + + # Mengatur lebar kolom + self.table.setColumnWidth(0, 300) # Lebar untuk laporan klasifikasi + self.table.setColumnWidth(1, 130) # Lebar untuk distribusi + + for i, line in enumerate(report_lines): + item = QTableWidgetItem(line) + item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit + self.table.setItem(i, 0, item) + + for i, line in enumerate(distribution_lines): + item = QTableWidgetItem(line) + item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit + self.table.setItem(i, 1, item) + + layout.addWidget(self.table) + self.setLayout(layout) + +class MainWindow(QWidget): + def __init__(self): + super().__init__() + self.initUI() + self.script_dir = os.path.dirname(__file__) + self.file_pl = os.path.join(self.script_dir, "pl.xlsx") + self.file_k = os.path.join(self.script_dir, "k.xlsx") + self.label_status.setText(f"Data Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}") + # Inisialisasi variabel untuk menghindari UnboundLocalError + self.classification_report = "" # Nilai default + self.distribution = "" # Nilai default + + def initUI(self): + self.setWindowTitle("TroopersSelect Data Processor") + self.setGeometry(100, 100, 600, 300) + self.setFixedSize(600, 300) + + layout = QVBoxLayout() + + self.label_status = QLabel("Pilih file dan folder untuk memulai.") + self.label_status.setAlignment(Qt.AlignmentFlag.AlignCenter) + + self.btn_select_folder = QPushButton("Pilih Folder Data") + self.btn_execute = QPushButton("Eksekusi") + + self.btn_select_folder.clicked.connect(self.select_folder) + self.btn_execute.clicked.connect(self.execute_process) + + layout.addWidget(self.label_status) + layout.addWidget(self.btn_select_folder) + layout.addWidget(self.btn_execute) + + self.setLayout(layout) + + self.input_dir = "" + + def select_folder(self): + # Memilih folder yang berisi data + folder = QFileDialog.getExistingDirectory(self, "Pilih Folder Data") + if folder: + self.input_dir = folder + self.label_status.setText(f"Folder Dipilih: {folder}\nData Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}") + + def execute_process(self): + # Mengeksekusi proses pengolahan data + if not self.input_dir: + self.label_status.setText("Error: Silakan pilih folder data.") + return + + self.label_status.setText("Memulai proses...") + home_dir = os.path.expanduser("~") + output_dir = os.path.join(home_dir, "Documents", "TroopersSelect", "Output") + combined_file = "1 - KM-PL Combined.xlsx" + self.processing_thread = ProcessingThread(self.input_dir, self.file_pl, self.file_k, output_dir, combined_file) + self.processing_thread.update_status.connect(self.label_status.setText) + self.processing_thread.process_finished.connect(self.show_result_window) + self.processing_thread.classification_report_signal.connect(self.set_classification_report) + self.processing_thread.distribution_signal.connect(self.set_distribution) + self.btn_execute.setEnabled(False) + self.processing_thread.start() + + def set_classification_report(self, report): + # Menyimpan laporan klasifikasi + self.classification_report = report + + def set_distribution(self, distribution): + # Menyimpan distribusi prediksi + self.distribution = distribution + + def show_result_window(self): + # Menampilkan jendela hasil + self.result_window = ResultWindow(self.classification_report, self.distribution) + self.result_window.show() + self.btn_execute.setEnabled(True) + +if __name__ == "__main__": + app = QApplication(sys.argv) + window = MainWindow() + window.show() + sys.exit(app.exec()) \ No newline at end of file