Files
TroopersSelect/py-final.py
2025-07-10 10:25:15 +00:00

460 lines
21 KiB
Python

import sys
import os
import pandas as pd
import joblib
from PyQt6.QtWidgets import (QApplication, QWidget, QPushButton, QFileDialog, QLabel, QVBoxLayout, QTableWidget, QTableWidgetItem)
from PyQt6.QtCore import Qt, QThread, pyqtSignal
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import getpass
def combine_excel_files(input_dir, output_dir, combined_file, log_var):
# Menggabungkan semua file Excel dalam direktori input menjadi satu file Excel
all_data = []
# Mendapatkan semua file Excel di direktori input
excel_files = [f for f in os.listdir(input_dir) if f.endswith(".xlsx")]
if not excel_files:
log_var.emit("Error: Direktori tidak berisi file Excel apa pun.")
return
# Membuat direktori output jika belum ada
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, combined_file)
for file in excel_files:
file_path = os.path.join(input_dir, file)
log_var.emit(f"Menggabungkan file: {file}")
xls = pd.ExcelFile(file_path)
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name)
df["Playground"] = file.replace(".xlsx", "")
if 'Position' in df.columns:
df["Position"] = df["Position"].str.replace('_', '/', regex=False)
all_data.append(df)
# Menggabungkan semua data dari berbagai file Excel
combined_df = pd.concat(all_data, ignore_index=True)
combined_df.to_excel(output_path, index=False)
log_var.emit(f"Dataset yang digabungkan disimpan ke {output_path}")
def show_columns(file_path, log_var):
# Menampilkan kolom yang ada dalam file Excel
try:
df = pd.read_excel(file_path)
log_var.emit(f"Kolom dalam {file_path}: {', '.join(df.columns.tolist())}")
except FileNotFoundError:
log_var.emit(f"Error: File tidak ditemukan - {file_path}")
sys.exit(1)
def check_required_columns(file_path, required_columns, log_var):
# Memeriksa apakah kolom yang diperlukan ada dalam file Excel
try:
df = pd.read_excel(file_path)
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
log_var.emit(f"Error: Kolom yang hilang dalam {file_path}: {', '.join(missing_columns)}")
sys.exit(1)
except FileNotFoundError:
log_var.emit(f"Error: File tidak ditemukan - {file_path}")
sys.exit(1)
class ProcessingThread(QThread):
update_status = pyqtSignal(str)
process_finished = pyqtSignal()
classification_report_signal = pyqtSignal(str)
distribution_signal = pyqtSignal(str)
def __init__(self, input_dir, file_pl, file_k, output_dir, combined_file):
super().__init__()
self.input_dir = input_dir
self.file_pl = file_pl
self.file_k = file_k
self.output_dir = output_dir
self.combined_file = combined_file
def run(self):
try:
self.update_status.emit("Menggabungkan file Excel...")
combine_excel_files(self.input_dir, self.output_dir, self.combined_file, self.update_status)
self.update_status.emit("Memuat data...")
df_cn = pd.read_excel(os.path.join(self.output_dir, self.combined_file))
df_pl = pd.read_excel(self.file_pl)
df_k = pd.read_excel(self.file_k)
# Mengubah semua nama kolom menjadi huruf kecil
df_cn.columns = df_cn.columns.str.lower()
df_pl.columns = df_pl.columns.str.lower()
df_k.columns = df_k.columns.str.lower()
# Mengganti nama kolom untuk konsistensi
df_k.rename(columns={'nama universitas': 'university'}, inplace=True)
df_pl.rename(columns={'nama posisi di website kampus merdeka': 'position'}, inplace=True)
# Menghapus spasi di awal dan akhir string dalam kolom tertentu
df_cn['university'] = df_cn['university'].str.strip()
df_pl['position'] = df_pl['position'].str.strip()
df_k['university'] = df_k['university'].str.strip()
# Menghapus duplikat berdasarkan kolom tertentu
df_k = df_k.drop_duplicates(subset=['university'])
df_pl = df_pl.drop_duplicates(subset=['position'])
# Menggabungkan data berdasarkan kolom 'university' dan 'position'
df_cn = pd.merge(df_cn,
df_k[['university', 'kota / kabupaten kampus', 'provinsi kampus',
'preferensi region', 'preferensi area', 'kampus partner',
'rank indonesia', 'kategori kampus']],
on='university', how='left')
df_cn = pd.merge(df_cn,
df_pl[['position', 'nama kota lengkap', 'provinsi penugasan',
'region', 'area', 'jurusan']],
on='position', how='left')
# Mengganti nama kolom untuk kejelasan
df_cn.columns = [
'cn-id', 'cn-position', 'cn-univ', 'cn-jurusan', 'cn-ipk',
'cn-semester', 'cn-tipekampus', 'cn-sertifikat organisasi 1',
'cn-sertifikat organisasi 2', 'cn-sertifikat organisasi 3',
'cn-sertifikat organisasi 4', 'cn-sertifikat organisasi 5','cn-recstatus',
'cn-playground', 'k-kota', 'k-provinsi', 'k-region',
'k-area', 'k-partner', 'k-rank', 'k-kategori', 'pl-kota', 'pl-provinsi',
'pl-region', 'pl-area', 'pl-jurusan',
]
output_file_path = os.path.join(self.output_dir, "2 - Combined.xlsx")
df_cn.to_excel(output_file_path, index=False)
self.update_status.emit(f"Data dimuat. Total baris: {len(df_cn)}. Memulai penilaian...")
# Fungsi untuk menilai IPK
def score_ipk(cn_ipk):
if cn_ipk > 3.75:
return 'Perfect'
elif cn_ipk > 3.50:
return 'Great'
elif cn_ipk > 3.25:
return 'Good'
else:
return 'Poor'
# Fungsi untuk menilai penempatan
def score_placement(row):
k_kampus = str(row['k-kota'])
pl_kota = str(row['pl-kota'])
k_region = str(row['k-region'])
pl_region = str(row['pl-region'])
k_provinsi = str(row['k-provinsi'])
pl_provinsi = str(row['pl-provinsi'])
k_area = str(row['k-area'])
pl_area = str(row['pl-area'])
if k_kampus == pl_kota:
return 'Excellent'
elif pl_region in k_region:
return 'Perfect'
elif k_provinsi == pl_provinsi:
return 'Great'
elif pl_area in k_area:
return 'Good'
else:
return 'Poor'
# Fungsi untuk menilai universitas
def score_university(k_partner, k_rank, k_kategori):
if k_partner == 'Ya' and k_rank in ['Top 10 PTN', 'Top 5 PTS']:
return 'Perfect'
elif k_partner == 'Ya' and k_rank == 'Regular':
return 'Great'
elif k_partner == 'Tidak' and k_rank in ['Top 10 PTN', 'Top 5 PTS']:
return 'Good'
elif k_kategori == 'PTN':
return 'Mid'
else:
return 'Poor'
# Fungsi untuk menilai organisasi
def score_org(row):
org_count = sum(not pd.isna(row[col]) for col in ['cn-sertifikat organisasi 1', 'cn-sertifikat organisasi 2',
'cn-sertifikat organisasi 3', 'cn-sertifikat organisasi 4',
'cn-sertifikat organisasi 5'])
if org_count == 0:
return 'Poor'
elif org_count == 1:
return 'Good'
else:
return 'Great'
# Fungsi untuk menilai kecocokan jurusan
def score_major_match(cn_jurusan, pl_jurusan):
if pd.isna(pl_jurusan):
return 'Poor'
pl_jurusan = str(pl_jurusan).strip()
cn_jurusan = str(cn_jurusan).strip()
if pl_jurusan == 'Semua Jurusan':
return 'Good'
pl_jurusan_list = [word.strip().lower() for word in pl_jurusan.split(',')]
cn_jurusan_words = cn_jurusan.lower().split()
if any(cn_word in pl_jurusan_list for cn_word in cn_jurusan_words):
return 'Great'
return 'Poor'
# Menilai setiap aspek dan menambahkan hasil penilaian ke dataframe
self.update_status.emit(f"Mengkategorikan IPK...")
df_cn[['ipk_class']] = df_cn['cn-ipk'].apply(lambda x: pd.Series(score_ipk(x)))
self.update_status.emit(f"Mengkategorikan Penempatan...")
df_cn[['placement_class']] = df_cn.apply(lambda row: pd.Series(score_placement(row)), axis=1)
self.update_status.emit(f"Mengkategorikan Universitas...")
df_cn[['univ_class']] = df_cn.apply(lambda row: pd.Series(score_university(row['k-partner'], row['k-rank'], row['k-kategori'])), axis=1)
self.update_status.emit(f"Mengkategorikan Organisasi...")
df_cn[['org_class']] = df_cn.apply(lambda row: pd.Series(score_org(row)), axis=1)
self.update_status.emit(f"Mengkategorikan Penilaian Jurusan...")
df_cn[['major_match_class']] = df_cn.apply(lambda row: pd.Series(score_major_match(row['cn-jurusan'], row['pl-jurusan'])), axis=1)
self.update_status.emit(f"Penilaian selesai. Mengekspor...")
scored_file_path = os.path.join(self.output_dir, "3 - Scored.xlsx")
df_cn.to_excel(scored_file_path, index=False)
user_name = getpass.getuser()
model_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\AITrain.pkl'
label_encoders_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\label_encoders.pkl'
data_path = scored_file_path
model_dir = os.path.dirname(model_path)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
self.update_status.emit(f"Direktori dibuat: {model_dir}")
label_encoders = {}
if os.path.exists(model_path) and os.path.exists(label_encoders_path):
nb_model = joblib.load(model_path)
self.update_status.emit("Model pelatihan tersedia. Menggunakan model...")
label_encoders = joblib.load(label_encoders_path)
self.update_status.emit("Label encoders dimuat.")
data = pd.read_excel(data_path)
# Menggunakan label encoders yang dimuat untuk mentransformasi data
for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']:
le = label_encoders[column] # Memuat encoder yang ada
data[column] = le.transform(data[column]) # Transformasi menggunakan encoder yang dimuat
X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']]
y = data['cn-recstatus']
# Menerapkan SMOTE ke seluruh dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
# Membagi data yang telah di-resample menjadi set pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
# Melatih model
nb_model.fit(X_train, y_train)
# Memprediksi dan mengevaluasi
y_pred = nb_model.predict(X_test)
report = classification_report(y_test, y_pred)
distribution = pd.Series(y_pred).value_counts().to_string()
self.classification_report_signal.emit(report)
self.distribution_signal.emit(distribution)
# Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test)
export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe
export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe
export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe
# Menyimpan dataframe ke dalam file Excel
final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx")
export_df.to_excel(final_file_path, index=False)
self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}")
else:
data = pd.read_excel(data_path)
# Membuat label encoders baru dan melatihnya
label_encoders = {}
for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']:
le = LabelEncoder()
data[column] = le.fit_transform(data[column])
label_encoders[column] = le # Menyimpan encoder untuk penggunaan di masa depan
X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']]
y = data['cn-recstatus']
# Menerapkan SMOTE ke seluruh dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
# Membagi data yang telah di-resample menjadi set pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
# Melatih model
nb_model = CategoricalNB()
nb_model.fit(X_train, y_train)
# Memprediksi dan mengevaluasi
y_pred = nb_model.predict(X_test)
self.update_status.emit(f"Akurasi: {accuracy_score(y_test, y_pred)}")
report = classification_report(y_test, y_pred)
distribution = pd.Series(y_pred).value_counts().to_string()
self.classification_report_signal.emit(report)
self.distribution_signal.emit(distribution)
# Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test)
export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe
export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe
export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe
# Menyimpan dataframe ke dalam file Excel
final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx")
export_df.to_excel(final_file_path, index=False)
self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}")
# Menyimpan model dan label encoders
try:
joblib.dump(nb_model, model_path)
self.update_status.emit(f"Model disimpan ke {model_path}")
joblib.dump(label_encoders, label_encoders_path)
self.update_status.emit(f"Label encoders disimpan ke {label_encoders_path}")
except Exception as e:
self.update_status.emit(f"Error menyimpan label encoders: {str(e)}")
self.process_finished.emit()
except Exception as e:
self.update_status.emit(f"Error: {str(e)}")
self.process_finished.emit()
class ResultWindow(QWidget):
def __init__(self, classification_report, distribution):
super().__init__()
self.setWindowTitle("Classification Report and Distribution")
self.setGeometry(100, 100, 600, 400) # Ukuran disesuaikan
self.setFixedSize(500, 300)
layout = QVBoxLayout()
self.table = QTableWidget()
self.table.setColumnCount(2)
self.table.setHorizontalHeaderLabels(["Classification Report", "Distribution"])
report_lines = classification_report.split('\n')
distribution_lines = distribution.split('\n')
max_rows = max(len(report_lines), len(distribution_lines))
self.table.setRowCount(max_rows)
# Mengatur lebar kolom
self.table.setColumnWidth(0, 300) # Lebar untuk laporan klasifikasi
self.table.setColumnWidth(1, 130) # Lebar untuk distribusi
for i, line in enumerate(report_lines):
item = QTableWidgetItem(line)
item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit
self.table.setItem(i, 0, item)
for i, line in enumerate(distribution_lines):
item = QTableWidgetItem(line)
item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit
self.table.setItem(i, 1, item)
layout.addWidget(self.table)
self.setLayout(layout)
class MainWindow(QWidget):
def __init__(self):
super().__init__()
self.initUI()
self.script_dir = os.path.dirname(__file__)
self.file_pl = os.path.join(self.script_dir, "pl.xlsx")
self.file_k = os.path.join(self.script_dir, "k.xlsx")
self.label_status.setText(f"Data Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}")
# Inisialisasi variabel untuk menghindari UnboundLocalError
self.classification_report = "" # Nilai default
self.distribution = "" # Nilai default
def initUI(self):
self.setWindowTitle("TroopersSelect Data Processor")
self.setGeometry(100, 100, 600, 300)
self.setFixedSize(600, 300)
layout = QVBoxLayout()
self.label_status = QLabel("Pilih file dan folder untuk memulai.")
self.label_status.setAlignment(Qt.AlignmentFlag.AlignCenter)
self.btn_select_folder = QPushButton("Pilih Folder Data")
self.btn_execute = QPushButton("Eksekusi")
self.btn_select_folder.clicked.connect(self.select_folder)
self.btn_execute.clicked.connect(self.execute_process)
layout.addWidget(self.label_status)
layout.addWidget(self.btn_select_folder)
layout.addWidget(self.btn_execute)
self.setLayout(layout)
self.input_dir = ""
def select_folder(self):
# Memilih folder yang berisi data
folder = QFileDialog.getExistingDirectory(self, "Pilih Folder Data")
if folder:
self.input_dir = folder
self.label_status.setText(f"Folder Dipilih: {folder}\nData Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}")
def execute_process(self):
# Mengeksekusi proses pengolahan data
if not self.input_dir:
self.label_status.setText("Error: Silakan pilih folder data.")
return
self.label_status.setText("Memulai proses...")
home_dir = os.path.expanduser("~")
output_dir = os.path.join(home_dir, "Documents", "TroopersSelect", "Output")
combined_file = "1 - KM-PL Combined.xlsx"
self.processing_thread = ProcessingThread(self.input_dir, self.file_pl, self.file_k, output_dir, combined_file)
self.processing_thread.update_status.connect(self.label_status.setText)
self.processing_thread.process_finished.connect(self.show_result_window)
self.processing_thread.classification_report_signal.connect(self.set_classification_report)
self.processing_thread.distribution_signal.connect(self.set_distribution)
self.btn_execute.setEnabled(False)
self.processing_thread.start()
def set_classification_report(self, report):
# Menyimpan laporan klasifikasi
self.classification_report = report
def set_distribution(self, distribution):
# Menyimpan distribusi prediksi
self.distribution = distribution
def show_result_window(self):
# Menampilkan jendela hasil
self.result_window = ResultWindow(self.classification_report, self.distribution)
self.result_window.show()
self.btn_execute.setEnabled(True)
if __name__ == "__main__":
app = QApplication(sys.argv)
window = MainWindow()
window.show()
sys.exit(app.exec())