Add py-final.py

This commit is contained in:
2025-07-10 10:25:15 +00:00
commit 3cf8861193

460
py-final.py Normal file
View File

@ -0,0 +1,460 @@
import sys
import os
import pandas as pd
import joblib
from PyQt6.QtWidgets import (QApplication, QWidget, QPushButton, QFileDialog, QLabel, QVBoxLayout, QTableWidget, QTableWidgetItem)
from PyQt6.QtCore import Qt, QThread, pyqtSignal
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import getpass
def combine_excel_files(input_dir, output_dir, combined_file, log_var):
# Menggabungkan semua file Excel dalam direktori input menjadi satu file Excel
all_data = []
# Mendapatkan semua file Excel di direktori input
excel_files = [f for f in os.listdir(input_dir) if f.endswith(".xlsx")]
if not excel_files:
log_var.emit("Error: Direktori tidak berisi file Excel apa pun.")
return
# Membuat direktori output jika belum ada
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, combined_file)
for file in excel_files:
file_path = os.path.join(input_dir, file)
log_var.emit(f"Menggabungkan file: {file}")
xls = pd.ExcelFile(file_path)
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name)
df["Playground"] = file.replace(".xlsx", "")
if 'Position' in df.columns:
df["Position"] = df["Position"].str.replace('_', '/', regex=False)
all_data.append(df)
# Menggabungkan semua data dari berbagai file Excel
combined_df = pd.concat(all_data, ignore_index=True)
combined_df.to_excel(output_path, index=False)
log_var.emit(f"Dataset yang digabungkan disimpan ke {output_path}")
def show_columns(file_path, log_var):
# Menampilkan kolom yang ada dalam file Excel
try:
df = pd.read_excel(file_path)
log_var.emit(f"Kolom dalam {file_path}: {', '.join(df.columns.tolist())}")
except FileNotFoundError:
log_var.emit(f"Error: File tidak ditemukan - {file_path}")
sys.exit(1)
def check_required_columns(file_path, required_columns, log_var):
# Memeriksa apakah kolom yang diperlukan ada dalam file Excel
try:
df = pd.read_excel(file_path)
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
log_var.emit(f"Error: Kolom yang hilang dalam {file_path}: {', '.join(missing_columns)}")
sys.exit(1)
except FileNotFoundError:
log_var.emit(f"Error: File tidak ditemukan - {file_path}")
sys.exit(1)
class ProcessingThread(QThread):
update_status = pyqtSignal(str)
process_finished = pyqtSignal()
classification_report_signal = pyqtSignal(str)
distribution_signal = pyqtSignal(str)
def __init__(self, input_dir, file_pl, file_k, output_dir, combined_file):
super().__init__()
self.input_dir = input_dir
self.file_pl = file_pl
self.file_k = file_k
self.output_dir = output_dir
self.combined_file = combined_file
def run(self):
try:
self.update_status.emit("Menggabungkan file Excel...")
combine_excel_files(self.input_dir, self.output_dir, self.combined_file, self.update_status)
self.update_status.emit("Memuat data...")
df_cn = pd.read_excel(os.path.join(self.output_dir, self.combined_file))
df_pl = pd.read_excel(self.file_pl)
df_k = pd.read_excel(self.file_k)
# Mengubah semua nama kolom menjadi huruf kecil
df_cn.columns = df_cn.columns.str.lower()
df_pl.columns = df_pl.columns.str.lower()
df_k.columns = df_k.columns.str.lower()
# Mengganti nama kolom untuk konsistensi
df_k.rename(columns={'nama universitas': 'university'}, inplace=True)
df_pl.rename(columns={'nama posisi di website kampus merdeka': 'position'}, inplace=True)
# Menghapus spasi di awal dan akhir string dalam kolom tertentu
df_cn['university'] = df_cn['university'].str.strip()
df_pl['position'] = df_pl['position'].str.strip()
df_k['university'] = df_k['university'].str.strip()
# Menghapus duplikat berdasarkan kolom tertentu
df_k = df_k.drop_duplicates(subset=['university'])
df_pl = df_pl.drop_duplicates(subset=['position'])
# Menggabungkan data berdasarkan kolom 'university' dan 'position'
df_cn = pd.merge(df_cn,
df_k[['university', 'kota / kabupaten kampus', 'provinsi kampus',
'preferensi region', 'preferensi area', 'kampus partner',
'rank indonesia', 'kategori kampus']],
on='university', how='left')
df_cn = pd.merge(df_cn,
df_pl[['position', 'nama kota lengkap', 'provinsi penugasan',
'region', 'area', 'jurusan']],
on='position', how='left')
# Mengganti nama kolom untuk kejelasan
df_cn.columns = [
'cn-id', 'cn-position', 'cn-univ', 'cn-jurusan', 'cn-ipk',
'cn-semester', 'cn-tipekampus', 'cn-sertifikat organisasi 1',
'cn-sertifikat organisasi 2', 'cn-sertifikat organisasi 3',
'cn-sertifikat organisasi 4', 'cn-sertifikat organisasi 5','cn-recstatus',
'cn-playground', 'k-kota', 'k-provinsi', 'k-region',
'k-area', 'k-partner', 'k-rank', 'k-kategori', 'pl-kota', 'pl-provinsi',
'pl-region', 'pl-area', 'pl-jurusan',
]
output_file_path = os.path.join(self.output_dir, "2 - Combined.xlsx")
df_cn.to_excel(output_file_path, index=False)
self.update_status.emit(f"Data dimuat. Total baris: {len(df_cn)}. Memulai penilaian...")
# Fungsi untuk menilai IPK
def score_ipk(cn_ipk):
if cn_ipk > 3.75:
return 'Perfect'
elif cn_ipk > 3.50:
return 'Great'
elif cn_ipk > 3.25:
return 'Good'
else:
return 'Poor'
# Fungsi untuk menilai penempatan
def score_placement(row):
k_kampus = str(row['k-kota'])
pl_kota = str(row['pl-kota'])
k_region = str(row['k-region'])
pl_region = str(row['pl-region'])
k_provinsi = str(row['k-provinsi'])
pl_provinsi = str(row['pl-provinsi'])
k_area = str(row['k-area'])
pl_area = str(row['pl-area'])
if k_kampus == pl_kota:
return 'Excellent'
elif pl_region in k_region:
return 'Perfect'
elif k_provinsi == pl_provinsi:
return 'Great'
elif pl_area in k_area:
return 'Good'
else:
return 'Poor'
# Fungsi untuk menilai universitas
def score_university(k_partner, k_rank, k_kategori):
if k_partner == 'Ya' and k_rank in ['Top 10 PTN', 'Top 5 PTS']:
return 'Perfect'
elif k_partner == 'Ya' and k_rank == 'Regular':
return 'Great'
elif k_partner == 'Tidak' and k_rank in ['Top 10 PTN', 'Top 5 PTS']:
return 'Good'
elif k_kategori == 'PTN':
return 'Mid'
else:
return 'Poor'
# Fungsi untuk menilai organisasi
def score_org(row):
org_count = sum(not pd.isna(row[col]) for col in ['cn-sertifikat organisasi 1', 'cn-sertifikat organisasi 2',
'cn-sertifikat organisasi 3', 'cn-sertifikat organisasi 4',
'cn-sertifikat organisasi 5'])
if org_count == 0:
return 'Poor'
elif org_count == 1:
return 'Good'
else:
return 'Great'
# Fungsi untuk menilai kecocokan jurusan
def score_major_match(cn_jurusan, pl_jurusan):
if pd.isna(pl_jurusan):
return 'Poor'
pl_jurusan = str(pl_jurusan).strip()
cn_jurusan = str(cn_jurusan).strip()
if pl_jurusan == 'Semua Jurusan':
return 'Good'
pl_jurusan_list = [word.strip().lower() for word in pl_jurusan.split(',')]
cn_jurusan_words = cn_jurusan.lower().split()
if any(cn_word in pl_jurusan_list for cn_word in cn_jurusan_words):
return 'Great'
return 'Poor'
# Menilai setiap aspek dan menambahkan hasil penilaian ke dataframe
self.update_status.emit(f"Mengkategorikan IPK...")
df_cn[['ipk_class']] = df_cn['cn-ipk'].apply(lambda x: pd.Series(score_ipk(x)))
self.update_status.emit(f"Mengkategorikan Penempatan...")
df_cn[['placement_class']] = df_cn.apply(lambda row: pd.Series(score_placement(row)), axis=1)
self.update_status.emit(f"Mengkategorikan Universitas...")
df_cn[['univ_class']] = df_cn.apply(lambda row: pd.Series(score_university(row['k-partner'], row['k-rank'], row['k-kategori'])), axis=1)
self.update_status.emit(f"Mengkategorikan Organisasi...")
df_cn[['org_class']] = df_cn.apply(lambda row: pd.Series(score_org(row)), axis=1)
self.update_status.emit(f"Mengkategorikan Penilaian Jurusan...")
df_cn[['major_match_class']] = df_cn.apply(lambda row: pd.Series(score_major_match(row['cn-jurusan'], row['pl-jurusan'])), axis=1)
self.update_status.emit(f"Penilaian selesai. Mengekspor...")
scored_file_path = os.path.join(self.output_dir, "3 - Scored.xlsx")
df_cn.to_excel(scored_file_path, index=False)
user_name = getpass.getuser()
model_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\AITrain.pkl'
label_encoders_path = f'C:\\Users\\{user_name}\\Documents\\TroopersSelect\\Model\\label_encoders.pkl'
data_path = scored_file_path
model_dir = os.path.dirname(model_path)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
self.update_status.emit(f"Direktori dibuat: {model_dir}")
label_encoders = {}
if os.path.exists(model_path) and os.path.exists(label_encoders_path):
nb_model = joblib.load(model_path)
self.update_status.emit("Model pelatihan tersedia. Menggunakan model...")
label_encoders = joblib.load(label_encoders_path)
self.update_status.emit("Label encoders dimuat.")
data = pd.read_excel(data_path)
# Menggunakan label encoders yang dimuat untuk mentransformasi data
for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']:
le = label_encoders[column] # Memuat encoder yang ada
data[column] = le.transform(data[column]) # Transformasi menggunakan encoder yang dimuat
X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']]
y = data['cn-recstatus']
# Menerapkan SMOTE ke seluruh dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
# Membagi data yang telah di-resample menjadi set pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
# Melatih model
nb_model.fit(X_train, y_train)
# Memprediksi dan mengevaluasi
y_pred = nb_model.predict(X_test)
report = classification_report(y_test, y_pred)
distribution = pd.Series(y_pred).value_counts().to_string()
self.classification_report_signal.emit(report)
self.distribution_signal.emit(distribution)
# Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test)
export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe
export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe
export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe
# Menyimpan dataframe ke dalam file Excel
final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx")
export_df.to_excel(final_file_path, index=False)
self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}")
else:
data = pd.read_excel(data_path)
# Membuat label encoders baru dan melatihnya
label_encoders = {}
for column in ['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']:
le = LabelEncoder()
data[column] = le.fit_transform(data[column])
label_encoders[column] = le # Menyimpan encoder untuk penggunaan di masa depan
X = data[['ipk_class', 'placement_class', 'univ_class', 'org_class', 'major_match_class']]
y = data['cn-recstatus']
# Menerapkan SMOTE ke seluruh dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
# Membagi data yang telah di-resample menjadi set pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
# Melatih model
nb_model = CategoricalNB()
nb_model.fit(X_train, y_train)
# Memprediksi dan mengevaluasi
y_pred = nb_model.predict(X_test)
self.update_status.emit(f"Akurasi: {accuracy_score(y_test, y_pred)}")
report = classification_report(y_test, y_pred)
distribution = pd.Series(y_pred).value_counts().to_string()
self.classification_report_signal.emit(report)
self.distribution_signal.emit(distribution)
# Membuat dataframe baru untuk y_test, y_pred, dan fitur-fitur (X_test)
export_df = pd.DataFrame(X_test) # Menambahkan X_test ke dalam dataframe
export_df['y_test'] = y_test # Menambahkan y_test ke dalam dataframe
export_df['y_pred'] = y_pred # Menambahkan y_pred ke dalam dataframe
# Menyimpan dataframe ke dalam file Excel
final_file_path = os.path.join(self.output_dir, "4 - Final.xlsx")
export_df.to_excel(final_file_path, index=False)
self.update_status.emit(f"4 - Final.xlsx telah berhasil diekspor ke {final_file_path}")
# Menyimpan model dan label encoders
try:
joblib.dump(nb_model, model_path)
self.update_status.emit(f"Model disimpan ke {model_path}")
joblib.dump(label_encoders, label_encoders_path)
self.update_status.emit(f"Label encoders disimpan ke {label_encoders_path}")
except Exception as e:
self.update_status.emit(f"Error menyimpan label encoders: {str(e)}")
self.process_finished.emit()
except Exception as e:
self.update_status.emit(f"Error: {str(e)}")
self.process_finished.emit()
class ResultWindow(QWidget):
def __init__(self, classification_report, distribution):
super().__init__()
self.setWindowTitle("Classification Report and Distribution")
self.setGeometry(100, 100, 600, 400) # Ukuran disesuaikan
self.setFixedSize(500, 300)
layout = QVBoxLayout()
self.table = QTableWidget()
self.table.setColumnCount(2)
self.table.setHorizontalHeaderLabels(["Classification Report", "Distribution"])
report_lines = classification_report.split('\n')
distribution_lines = distribution.split('\n')
max_rows = max(len(report_lines), len(distribution_lines))
self.table.setRowCount(max_rows)
# Mengatur lebar kolom
self.table.setColumnWidth(0, 300) # Lebar untuk laporan klasifikasi
self.table.setColumnWidth(1, 130) # Lebar untuk distribusi
for i, line in enumerate(report_lines):
item = QTableWidgetItem(line)
item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit
self.table.setItem(i, 0, item)
for i, line in enumerate(distribution_lines):
item = QTableWidgetItem(line)
item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Membuat teks tidak dapat diedit
self.table.setItem(i, 1, item)
layout.addWidget(self.table)
self.setLayout(layout)
class MainWindow(QWidget):
def __init__(self):
super().__init__()
self.initUI()
self.script_dir = os.path.dirname(__file__)
self.file_pl = os.path.join(self.script_dir, "pl.xlsx")
self.file_k = os.path.join(self.script_dir, "k.xlsx")
self.label_status.setText(f"Data Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}")
# Inisialisasi variabel untuk menghindari UnboundLocalError
self.classification_report = "" # Nilai default
self.distribution = "" # Nilai default
def initUI(self):
self.setWindowTitle("TroopersSelect Data Processor")
self.setGeometry(100, 100, 600, 300)
self.setFixedSize(600, 300)
layout = QVBoxLayout()
self.label_status = QLabel("Pilih file dan folder untuk memulai.")
self.label_status.setAlignment(Qt.AlignmentFlag.AlignCenter)
self.btn_select_folder = QPushButton("Pilih Folder Data")
self.btn_execute = QPushButton("Eksekusi")
self.btn_select_folder.clicked.connect(self.select_folder)
self.btn_execute.clicked.connect(self.execute_process)
layout.addWidget(self.label_status)
layout.addWidget(self.btn_select_folder)
layout.addWidget(self.btn_execute)
self.setLayout(layout)
self.input_dir = ""
def select_folder(self):
# Memilih folder yang berisi data
folder = QFileDialog.getExistingDirectory(self, "Pilih Folder Data")
if folder:
self.input_dir = folder
self.label_status.setText(f"Folder Dipilih: {folder}\nData Playground Dipilih: {self.file_pl}\nData Kampus Dipilih: {self.file_k}")
def execute_process(self):
# Mengeksekusi proses pengolahan data
if not self.input_dir:
self.label_status.setText("Error: Silakan pilih folder data.")
return
self.label_status.setText("Memulai proses...")
home_dir = os.path.expanduser("~")
output_dir = os.path.join(home_dir, "Documents", "TroopersSelect", "Output")
combined_file = "1 - KM-PL Combined.xlsx"
self.processing_thread = ProcessingThread(self.input_dir, self.file_pl, self.file_k, output_dir, combined_file)
self.processing_thread.update_status.connect(self.label_status.setText)
self.processing_thread.process_finished.connect(self.show_result_window)
self.processing_thread.classification_report_signal.connect(self.set_classification_report)
self.processing_thread.distribution_signal.connect(self.set_distribution)
self.btn_execute.setEnabled(False)
self.processing_thread.start()
def set_classification_report(self, report):
# Menyimpan laporan klasifikasi
self.classification_report = report
def set_distribution(self, distribution):
# Menyimpan distribusi prediksi
self.distribution = distribution
def show_result_window(self):
# Menampilkan jendela hasil
self.result_window = ResultWindow(self.classification_report, self.distribution)
self.result_window.show()
self.btn_execute.setEnabled(True)
if __name__ == "__main__":
app = QApplication(sys.argv)
window = MainWindow()
window.show()
sys.exit(app.exec())