# -*- coding: utf-8 -*- import pandas as pd import numpy as np import os import platform import chardet from datetime import datetime import tkinter as tk from tkinter import filedialog, messagebox, ttk import matplotlib import matplotlib.pyplot as plt from docx import Document from docx.shared import Inches from docx.enum.text import WD_ALIGN_PARAGRAPH import tempfile from sklearn.ensemble import IsolationForest from sklearn.preprocessing import StandardScaler # Настройка бэкенда для macOS if platform.system() == 'Darwin': matplotlib.use('TkAgg') class BookDataCleanerApp: def __init__(self, root): self.root = root self.root.title("Очистка данных об образовании") self.input_file_path = None self.cleaned_df = None self.original_df = None self.anomalies_df = None self.setup_ui() self.adapt_platform() def adapt_platform(self): if platform.system() == 'Darwin': # macOS os.environ['TK_SILENCE_DEPRECATION'] = '1' self.root.tk.call('tk', 'scaling', 1.5) style = ttk.Style() style.theme_use('clam') style.configure('TButton', padding=6) if int(platform.mac_ver()[0].split('.')[0]) >= 12: self.root.tk.call('set', '::tk::mac::useThemeFont', '1') else: style = ttk.Style() style.theme_use('vista' if platform.system() == 'Windows' else 'clam') style.configure('TButton', padding=4) def setup_ui(self): # Светло-голубой фон всего окна self.root.configure(bg='#e6f3ff') self.root.geometry("1100x800") header_frame = tk.Frame(self.root, bg='#e6f3ff', pady=15) header_frame.pack(fill='x') tk.Label( header_frame, text="📚 Очистка данных об образовании", font=('Segoe UI', 18, 'bold'), bg='#e6f3ff', fg='black' # Чёрный текст ).pack(pady=5) tk.Label( header_frame, text="Настройте параметры очистки слева, используйте кнопки справа", font=('Segoe UI', 10, 'italic'), bg='#e6f3ff', fg='black' ).pack() main_frame = tk.Frame(self.root, bg='#e6f3ff') main_frame.pack(fill='both', expand=True, padx=20, pady=10) left_panel = tk.Frame(main_frame, bg='#ffffff', padx=15, pady=15, relief='groove', bd=1) left_panel.pack(side='left', fill='both', expand=True, padx=(0, 10)) right_panel = tk.Frame(main_frame, bg='#ffffff', padx=15, pady=15, relief='groove', bd=1) right_panel.pack(side='right', fill='y', padx=(10, 0)) self.create_file_selection_block(left_panel) self.create_cleaning_settings_block(left_panel) self.create_progress_block(left_panel) self.create_log_block(left_panel) self.create_controls_block(right_panel) def create_file_selection_block(self, parent): frame = tk.LabelFrame(parent, text="Исходные данные", bg='#ffffff', padx=10, pady=10) frame.pack(fill='x', pady=(0, 10)) row = tk.Frame(frame, bg='#ffffff') row.pack(fill='x', pady=5) tk.Label(row, text="Выберите файл:", bg='#ffffff', fg='black').pack(side='left') self.browse_btn = tk.Button( row, text="Обзор...", command=self.select_file, bg='#d3d3d3', # Светло-серый fg='black', # Чёрный текст relief='flat' ) self.browse_btn.pack(side='right', padx=5) self.file_path_entry = ttk.Entry(frame, width=40) self.file_path_entry.pack(fill='x', pady=5) self.file_path_entry.configure(state='readonly') def create_cleaning_settings_block(self, parent): frame = tk.LabelFrame(parent, text="Параметры очистки", bg='#ffffff', padx=10, pady=10) frame.pack(fill='x', pady=(0, 10)) num_frame = tk.LabelFrame(frame, text="Числовые данные", bg='#ffffff', padx=8, pady=8) num_frame.pack(fill='x', pady=5) self.numeric_missing_var = tk.StringVar(value='0') ttk.Radiobutton(num_frame, text="Пропуски → 0", variable=self.numeric_missing_var, value='0').pack(anchor='w') ttk.Radiobutton(num_frame, text="Пропуски → медиана", variable=self.numeric_missing_var, value='median').pack(anchor='w') self.numeric_type_var = tk.StringVar(value='int') ttk.Radiobutton(num_frame, text="Тип: целые (int)", variable=self.numeric_type_var, value='int').pack(anchor='w') ttk.Radiobutton(num_frame, text="Тип: вещественные (float)", variable=self.numeric_type_var, value='float').pack(anchor='w') cat_frame = tk.LabelFrame(frame, text="Категориальные данные", bg='#ffffff', padx=8, pady=8) cat_frame.pack(fill='x', pady=5) self.categorical_missing_var = tk.StringVar(value='Не указано') ttk.Radiobutton(cat_frame, text="Пропуски → «Не указано»", variable=self.categorical_missing_var, value='Не указано').pack(anchor='w') ttk.Radiobutton(cat_frame, text="Пропуски → «Unknown»", variable=self.categorical_missing_var, value='Unknown').pack(anchor='w') ttk.Radiobutton(cat_frame, text="Пропуски → «—»", variable=self.categorical_missing_var, value='—').pack(anchor='w') time_frame = tk.LabelFrame(frame, text="Временные данные (Year)", bg='#ffffff', padx=8, pady=8) time_frame.pack(fill='x', pady=5) range_frame = tk.Frame(time_frame, bg='#ffffff') range_frame.pack(anchor='w') tk.Label(range_frame, text="Диапазон лет:", bg='#ffffff', fg='black').pack(side='left') self.year_min_var = tk.StringVar(value='2014') ttk.Entry(range_frame, textvariable=self.year_min_var, width=6).pack(side='left', padx=5) tk.Label(range_frame, text="–", bg='#ffffff', fg='black').pack(side='left') self.year_max_var = tk.StringVar(value='2023') ttk.Entry(range_frame, textvariable=self.year_max_var, width=6).pack(side='left', padx=5) def create_progress_block(self, parent): frame = tk.Frame(parent, bg='#ffffff') frame.pack(fill='x', pady=(10, 5)) self.progress = ttk.Progressbar(frame, mode='determinate', length=250) self.progress.pack(pady=5) self.status_label = tk.Label(frame, text="Готово", bg='#ffffff', fg='black') self.status_label.pack() def create_log_block(self, parent): frame = tk.LabelFrame(parent, text="Лог анализа", bg='#ffffff', padx=10, pady=10) frame.pack(fill='both', expand=True, pady=(10, 0)) self.log_text = tk.Text( frame, bg='#f0f8ff', # Светлый фон лога fg='black', insertbackground='black', font=('Consolas', 9), height=10, wrap='none' ) self.log_text.pack(fill='both', expand=True) scrollbar = ttk.Scrollbar(frame, orient='vertical', command=self.log_text.yview) scrollbar.pack(side='right', fill='y') self.log_text.configure(yscrollcommand=scrollbar.set) def create_controls_block(self, parent): self.auto_save_var = tk.BooleanVar(value=True) chk = ttk.Checkbutton( parent, text="Автосохранение\nв исходную папку", variable=self.auto_save_var, onvalue=True, offvalue=False ) chk.pack(pady=(0, 15)) btn_frame_1 = tk.Frame(parent, bg='#ffffff') btn_frame_1.pack(pady=5) self.clean_btn = tk.Button( btn_frame_1, text="🧹 Начать анализ", command=self.clean_data, bg='#d3d3d3', # Светло-серый fg='black', # Чёрный текст font=('Arial', 10, 'bold'), width=22, height=2 ) self.clean_btn.pack(pady=2) self.clean_btn.configure(state='disabled') self.export_btn = tk.Button( btn_frame_1, text="📄 Экспорт данных в Word", command=self.export_to_word, bg='#d3d3d3', fg='black', font=('Arial', 10, 'bold'), width=22, height=2, state='disabled' ) self.export_btn.pack(pady=2) self.view_btn = tk.Button( btn_frame_1, text="🔍 Просмотр и сортировка данных", command=self.view_data, bg='#d3d3d3', fg='black', font=('Arial', 10, 'bold'), width=22, height=2, state='disabled' ) self.view_btn.pack(pady=2) btn_frame_2 = tk.Frame(parent, bg='#ffffff') btn_frame_2.pack(pady=(20, 5)) self.anomalies_btn = tk.Button( btn_frame_2, text="⚠️ Выбросы и аномалии", command=self.view_anomalies, bg='#d3d3d3', fg='black', font=('Arial', 10, 'bold'), width=22, height=2, state='disabled' ) self.anomalies_btn.pack(pady=2) self.save_anomalies_btn = tk.Button( btn_frame_2, text="💾 Отчет по выбросам\nи аномалиям", command=self.save_anomalies_report, bg='#d3d3d3', fg='black', font=('Arial', 10, 'bold'), width=22, height=2, state='disabled' ) self.save_anomalies_btn.pack(pady=2) def log(self, message, tag=''): if not self.log_text: return self.log_text.insert('end', f"{datetime.now().strftime('%H:%M:%S')} - {message}\n") self.log_text.see('end') self.root.update_idletasks() def select_file(self): file_path = filedialog.askopenfilename( title="Выберите файл с данными", filetypes=[ ("CSV файлы", "*.csv"), ("Excel файлы", "*.xlsx *.xls"), ("Текстовые файлы", "*.txt"), ("Все файлы", "*.*") ] ) if file_path: self.input_file_path = file_path self.file_path_entry.configure(state='normal') self.file_path_entry.delete(0, 'end') self.file_path_entry.insert(0, file_path) self.file_path_entry.configure(state='readonly') self.clean_btn.configure(state='normal') self.log(f"Файл выбран: {os.path.basename(file_path)}") def get_output_path(self, suffix="", extension=None): if self.input_file_path: dir_path = os.path.dirname(self.input_file_path) base = os.path.splitext(os.path.basename(self.input_file_path))[0] if extension is None: extension = os.path.splitext(self.input_file_path)[1] return os.path.join(dir_path, f"{base}{suffix}{extension}") return f"cleaned_books{suffix}.csv" def detect_encoding(self, file_path): try: with open(file_path, 'rb') as f: raw_data = f.read() result = chardet.detect(raw_data) encoding = result['encoding'] confidence = result['confidence'] if confidence < 0.7: fallback = 'utf-8' if 'UTF' in encoding else 'windows-1251' self.log(f"🔍 Низкая уверенность ({confidence:.2f}), используется {fallback}") return fallback self.log(f"🔍 Кодировка: {encoding} (уверенность: {confidence:.2f})") return encoding except Exception as e: self.log(f"⚠️ Ошибка кодировки: {e}") return 'utf-8' def load_data(self, file_ext): try: if file_ext in ['.xls', '.xlsx']: df = pd.read_excel(self.input_file_path, engine='openpyxl') else: encoding = self.detect_encoding(self.input_file_path) for delimiter in [',', ';', '\t']: try: df = pd.read_csv(self.input_file_path, encoding=encoding, delimiter=delimiter) if len(df.columns) > 1: self.log(f"✅ Загружено с разделителем '{delimiter}'") return df except: continue df = pd.read_csv(self.input_file_path, encoding=encoding, sep=None, engine='python') self.log(f"✅ Успешно загружено {len(df)} записей") self.progress['value'] = 25 self.original_df = df.copy() return df except Exception as e: raise RuntimeError(f"Ошибка загрузки: {str(e)}") def clean_numeric_column(self, series): series = pd.to_numeric(series, errors='coerce') missing_strategy = self.numeric_missing_var.get() if missing_strategy == '0': series = series.fillna(0) elif missing_strategy == 'median': median_val = series.median() series = series.fillna(median_val if pd.notna(median_val) else 0) dtype = self.numeric_type_var.get() if dtype == 'int': series = series.round().astype('Int64') else: series = series.astype('float64') return series def clean_categorical_column(self, series): series = series.astype(str).replace(['nan', 'None', 'NULL', ''], pd.NA) fill_value = self.categorical_missing_var.get() return series.fillna(fill_value).str.strip() def clean_year_column(self, series): series = pd.to_numeric(series, errors='coerce') try: year_min = int(self.year_min_var.get()) year_max = int(self.year_max_var.get()) except: year_min, year_max = 2014, 2023 series = series.where((series >= year_min) & (series <= year_max), other=pd.NA) if self.numeric_missing_var.get() == '0': series = series.fillna(0) elif self.numeric_missing_var.get() == 'median': median_val = series.median() series = series.fillna(median_val if pd.notna(median_val) else 0) return series.astype('Int64') def perform_cleaning(self, df): try: self.log("✅ Начало очистки с выбранными параметрами...") ADMISSIONS_COLUMNS = [ 'Index', 'Education level', 'Degree', 'Tuition fees', 'ID', 'Branches of Science', 'Group of Professions', 'Year', 'Number of Applications', 'Number of Students' ] if len(df.columns) == len(ADMISSIONS_COLUMNS): df.columns = ADMISSIONS_COLUMNS numeric_cols = ['Number of Applications', 'Number of Students', 'ID'] categorical_cols = [ 'Education level', 'Degree', 'Tuition fees', 'Branches of Science', 'Group of Professions' ] year_col = 'Year' for col in numeric_cols: if col in df.columns: df[col] = self.clean_numeric_column(df[col]) self.log(f"✅ Обработан числовой столбец: {col}") for col in categorical_cols: if col in df.columns: df[col] = self.clean_categorical_column(df[col]) self.log(f"✅ Обработан категориальный столбец: {col}") if year_col in df.columns: df[year_col] = self.clean_year_column(df[year_col]) self.log(f"✅ Обработан временной столбец: {year_col}") df['Conversion'] = df['Number of Students'] / (df['Number of Applications'] + 1e-9) initial = len(df) df = df.drop_duplicates() if len(df) < initial: self.log(f"➖ Удалено {initial - len(df)} дубликатов") self.progress['value'] = 75 return df except Exception as e: raise RuntimeError(f"Ошибка очистки: {str(e)}") def clean_data(self): if not self.input_file_path: messagebox.showwarning("Ошибка", "Сначала выберите файл!") return try: self.cleaned_df = None self.anomalies_df = None self.progress['value'] = 0 self.status_label.config(text="Выполняется анализ...") self.log("Начало анализа с пользовательскими настройками") file_ext = os.path.splitext(self.input_file_path)[1].lower() self.log("Шаг 1/4: Загрузка данных...") df = self.load_data(file_ext) self.log("Шаг 2/4: Очистка с выбранными параметрами...") df = self.perform_cleaning(df) if self.auto_save_var.get(): output_path = self.get_output_path("_cleaned") self.log(f"Шаг 3/4: Сохранение в {os.path.basename(output_path)}") if file_ext in ['.xlsx', '.xls']: df.to_excel(output_path, index=False, engine='openpyxl') else: df.to_csv(output_path, sep=';', index=False, encoding='utf-8-sig') self.log(f"✅ Файл сохранён: {output_path}") self.cleaned_df = df self.progress['value'] = 100 self.status_label.config(text="Анализ завершён") self.log("✅ АНАЛИЗ ЗАВЕРШЁН!") self.export_btn.configure(state='normal') self.view_btn.configure(state='normal') self.anomalies_btn.configure(state='normal') self.save_anomalies_btn.configure(state='normal') messagebox.showinfo("Готово", "Анализ данных успешно завершён!") except Exception as e: self.log(f"❌ Ошибка: {str(e)}") messagebox.showerror("Ошибка", f"Произошла ошибка: {str(e)}") def create_visualizations(self): if self.cleaned_df is None: return None plots = {} temp_dir = tempfile.mkdtemp() try: df = self.cleaned_df df_2019_2023 = df[(df['Year'] >= 2019) & (df['Year'] <= 2023)] # 1. ТОП-5 групп профессий по годам if not df_2019_2023.empty: top5_by_year = ( df_2019_2023.groupby(['Year', 'Group of Professions'])['Number of Applications'] .sum() .reset_index() .sort_values(['Year', 'Number of Applications'], ascending=[True, False]) .groupby('Year') .head(5) ) if not top5_by_year.empty: all_groups = sorted(top5_by_year['Group of Professions'].unique()) group_pos_map = {g: i for i, g in enumerate(all_groups)} years = sorted(top5_by_year['Year'].unique()) bar_width = 0.8 / len(years) plt.figure(figsize=(12, 7)) for i, year in enumerate(years): year_data = top5_by_year[top5_by_year['Year'] == year] if not year_data.empty: indices = np.array([group_pos_map[g] for g in year_data['Group of Professions']]) bars = plt.bar(indices + i * bar_width, year_data['Number of Applications'], width=bar_width, label=str(year), alpha=0.8) for bar in bars: h = bar.get_height() if h > 0: plt.text(bar.get_x() + bar.get_width() / 2., h, f'{int(h):,}', ha='center', va='bottom', fontsize=8) plt.title('ТОП-5 востребованных групп профессий по годам (2019–2023)', fontsize=14, fontweight='bold') plt.xlabel('Группа профессий') plt.ylabel('Число заявок') plt.xticks(np.arange(len(all_groups)) + (len(years) - 1) * bar_width / 2, all_groups, rotation=45, ha='right') plt.legend(title='Год') plt.grid(axis='y', alpha=0.3) plt.tight_layout() path = os.path.join(temp_dir, 'top5_professions.png') plt.savefig(path, dpi=150) plt.close() plots['top5'] = path # 2. Доля платного приёма if not df_2019_2023.empty: paid_share = ( df_2019_2023.groupby('Group of Professions') .apply(lambda x: x[x['Tuition fees'] == 'Full pay / tuition fee']['Number of Applications'].sum() / (x['Number of Applications'].sum() + 1e-9)) .sort_values(ascending=False) ) if len(paid_share) > 0: if len(paid_share) > 8: top = paid_share.head(8) other = paid_share[8:].mean() labels = list(top.index) + ['Другие'] sizes = list(top.values) + [other] else: labels, sizes = paid_share.index, paid_share.values plt.figure(figsize=(8, 8)) wedges, texts, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, shadow=True, explode=[0.05]*len(sizes)) for autotext in autotexts: autotext.set_color('white') autotext.set_fontweight('bold') plt.title('Доля платного приёма по группам профессий (2019–2023)', fontsize=14, fontweight='bold') plt.tight_layout() path = os.path.join(temp_dir, 'paid_share.png') plt.savefig(path, dpi=150) plt.close() plots['paid'] = path # 3. Динамика по ключевым отраслям target_branches = ['Engineering and Technology', 'Healthcare and Medicine', 'Education Sciences and Pedagogy'] df_branches = df[df['Branches of Science'].isin(target_branches)] if not df_branches.empty: yearly_branches = df_branches.groupby(['Branches of Science', 'Year'])['Number of Applications'].sum().reset_index() plt.figure(figsize=(12, 7)) colors = {'Engineering and Technology': 'steelblue', 'Healthcare and Medicine': 'crimson', 'Education Sciences and Pedagogy': 'forestgreen'} for branch in target_branches: branch_data = yearly_branches[yearly_branches['Branches of Science'] == branch] if not branch_data.empty: plt.plot(branch_data['Year'], branch_data['Number of Applications'], marker='o', linestyle='-', color=colors[branch], label=branch, linewidth=2) if len(branch_data) > 1: z = np.polyfit(branch_data['Year'], branch_data['Number of Applications'], 1) p = np.poly1d(z) plt.plot(branch_data['Year'], p(branch_data['Year']), "--", color=colors[branch], alpha=0.7) plt.title('Динамика заявок по ключевым отраслям (2014–2023)', fontsize=14, fontweight='bold') plt.xlabel('Год') plt.ylabel('Число заявок') plt.legend(title='Отрасль') plt.grid(True, alpha=0.3) plt.tight_layout() path = os.path.join(temp_dir, 'branches_trend.png') plt.savefig(path, dpi=150) plt.close() plots['branches'] = path # 4. Низкая конверсия (2022–2023) df_recent = df[(df['Year'] >= 2022) & (df['Year'] <= 2023)].copy() df_recent = df_recent[df_recent['Number of Applications'] > 0] if not df_recent.empty: top10 = df_recent.nlargest(10, 'Number of Applications') if len(top10) > 0: median_conv = top10['Conversion'].median() low_conv = top10[top10['Conversion'] < median_conv].copy() low_conv = low_conv.sort_values('Number of Applications', ascending=True) if not low_conv.empty: plt.figure(figsize=(10, 6)) bars = plt.barh(low_conv['Group of Professions'], low_conv['Number of Applications'], color='salmon', edgecolor='black', alpha=0.8) for bar in bars: width = bar.get_width() plt.text(width + max(low_conv['Number of Applications']) * 0.01, bar.get_y() + bar.get_height() / 2, f'{int(width):,}', va='center', fontweight='bold') plt.title('ТОП-10: Программы с высоким спросом и низкой конверсией (2022–2023)', fontsize=14, fontweight='bold') plt.xlabel('Число заявок') plt.ylabel('Группа профессий') plt.grid(axis='x', alpha=0.3) plt.tight_layout() path = os.path.join(temp_dir, 'low_conversion_top10.png') plt.savefig(path, dpi=150) plt.close() plots['conversion'] = path return plots, temp_dir except Exception as e: self.log(f"⚠️ Ошибка визуализации: {e}") return None, temp_dir def export_to_word(self): if self.cleaned_df is None: messagebox.showwarning("Ошибка", "Сначала выполните анализ данных!") return try: file_path = filedialog.asksaveasfilename( defaultextension=".docx", filetypes=[("Word documents", "*.docx"), ("All files", "*.*")], title="Сохранить отчёт в Word" ) if not file_path: return plots, temp_dir = self.create_visualizations() doc = Document() doc.add_heading('Аналитический отчёт по данным приёма', 0) doc.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER doc.add_heading('1. Параметры очистки', level=1) doc.add_paragraph(f"Числовые данные: пропуски → {self.numeric_missing_var.get()}, тип → {self.numeric_type_var.get()}") doc.add_paragraph(f"Категориальные данные: пропуски → «{self.categorical_missing_var.get()}»") doc.add_paragraph(f"Временные данные: диапазон лет {self.year_min_var.get()}–{self.year_max_var.get()}") doc.add_heading('2. Пример данных', level=1) sample = self.cleaned_df.head(5) table = doc.add_table(rows=1, cols=len(sample.columns)) table.style = 'Table Grid' for j, col in enumerate(sample.columns): table.cell(0, j).text = str(col) for i, row in sample.iterrows(): row_cells = table.add_row().cells for j, val in enumerate(row): row_cells[j].text = str(val) if pd.notna(val) else '' # Все графики sections = [ ('top5', 'ТОП-5 востребованных групп профессий по годам (2019–2023)'), ('paid', 'Доля платного приёма по группам профессий (2019–2023)'), ('branches', 'Динамика заявок по ключевым отраслям (2014–2023)'), ('conversion', 'ТОП-10: Программы с высоким спросом и низкой конверсией (2022–2023)') ] for key, title in sections: if plots and key in plots: doc.add_heading(title, level=1) doc.add_picture(plots[key], width=Inches(6)) doc.add_paragraph() doc.save(file_path) self.log(f"✅ Отчёт сохранён: {os.path.basename(file_path)}") if plots: for p in plots.values(): if os.path.exists(p): os.remove(p) os.rmdir(temp_dir) messagebox.showinfo("Успех", f"Отчёт сохранён!\n{file_path}") except Exception as e: self.log(f"❌ Ошибка экспорта: {e}") messagebox.showerror("Ошибка", f"Не удалось сохранить отчёт: {e}") def view_data(self): if self.cleaned_df is None: messagebox.showwarning("Ошибка", "Сначала выполните анализ данных!") return view_window = tk.Toplevel(self.root) view_window.title("Просмотр и сортировка данных") view_window.geometry("1000x600") view_window.configure(bg='#e6f3ff') search_frame = tk.Frame(view_window, bg='#e6f3ff', pady=10) search_frame.pack(fill='x', padx=10) tk.Label(search_frame, text="Поиск по Year:", bg='#e6f3ff', fg='black').pack(side='left') year_var = tk.StringVar() year_entry = ttk.Entry(search_frame, textvariable=year_var, width=15) year_entry.pack(side='left', padx=5) def search_by_year(): query = year_var.get().strip() if not query: update_treeview(self.cleaned_df) return try: year_int = int(query) filtered_df = self.cleaned_df[self.cleaned_df['Year'] == year_int] update_treeview(filtered_df) except ValueError: messagebox.showwarning("Ошибка", "Введите целое число") def reset_search(): year_var.set('') update_treeview(self.cleaned_df) tk.Button(search_frame, text="Поиск", command=search_by_year, bg='#d3d3d3', fg='black').pack(side='left', padx=5) tk.Button(search_frame, text="Сброс", command=reset_search, bg='#d3d3d3', fg='black').pack(side='left', padx=5) sort_frame = tk.Frame(view_window, bg='#e6f3ff', pady=10) sort_frame.pack(fill='x', padx=10) tk.Label(sort_frame, text="Сортировка по:", bg='#e6f3ff', fg='black').pack(side='left') sort_var = tk.StringVar() sort_combo = ttk.Combobox(sort_frame, textvariable=sort_var, values=list(self.cleaned_df.columns), state='readonly', width=25) sort_combo.pack(side='left', padx=5) def apply_sort(ascending): col = sort_var.get() if not col: return sorted_df = self.cleaned_df.sort_values(by=col, ascending=ascending) update_treeview(sorted_df) tk.Button(sort_frame, text="по возрастанию", command=lambda: apply_sort(True), bg='#d3d3d3', fg='black').pack(side='left', padx=5) tk.Button(sort_frame, text="по убыванию", command=lambda: apply_sort(False), bg='#d3d3d3', fg='black').pack(side='left', padx=5) tree_frame = tk.Frame(view_window, bg='white') tree_frame.pack(fill='both', expand=True, padx=10, pady=10) tree_scroll_y = ttk.Scrollbar(tree_frame, orient='vertical') tree_scroll_x = ttk.Scrollbar(tree_frame, orient='horizontal') tree = ttk.Treeview(tree_frame, columns=list(self.cleaned_df.columns), show='headings', yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set) tree.pack(side='left', fill='both', expand=True) tree_scroll_y.pack(side='right', fill='y') tree_scroll_x.pack(side='bottom', fill='x') tree_scroll_y.config(command=tree.yview) tree_scroll_x.config(command=tree.xview) for col in self.cleaned_df.columns: tree.heading(col, text=col) tree.column(col, width=100) def update_treeview(df): for item in tree.get_children(): tree.delete(item) for _, row in df.head(1000).iterrows(): values = [str(val) if pd.notna(val) else "" for val in row] tree.insert('', 'end', values=values) update_treeview(self.cleaned_df) def detect_anomalies(self, df): try: df_anomalies = df.copy() actions = [] year_mask = (df_anomalies['Year'] < 2000) | (df_anomalies['Year'] > 2030) year_anomalies = df_anomalies[year_mask].index actions.extend([(idx, "Некорректный год") for idx in year_anomalies]) if 'Number of Applications' in df_anomalies.columns: q999 = df_anomalies['Number of Applications'].quantile(0.999) app_anomalies = df_anomalies[df_anomalies['Number of Applications'] > q999].index actions.extend([(idx, "Очень высокое число заявок") for idx in app_anomalies]) numeric_features = ['Year', 'Number of Applications', 'Number of Students'] available_features = [f for f in numeric_features if f in df_anomalies.columns] if len(available_features) >= 2: X = df_anomalies[available_features].fillna(0) scaler = StandardScaler() X_scaled = scaler.fit_transform(X) model = IsolationForest(contamination=0.05, random_state=42) preds = model.fit_predict(X_scaled) ml_anomalies = df_anomalies[preds == -1].index actions.extend([(idx, "Аномалия (ML)") for idx in ml_anomalies]) action_dict = dict(actions) df_anomalies['Статус'] = df_anomalies.index.map(action_dict).fillna("Нормальная запись") result_df = df_anomalies[df_anomalies['Статус'] != "Нормальная запись"] return result_df except Exception as e: self.log(f"⚠️ Ошибка обнаружения аномалий: {e}") empty_df = df.copy().iloc[0:0] empty_df['Статус'] = "" return empty_df def view_anomalies(self): if self.cleaned_df is None: messagebox.showwarning("Ошибка", "Сначала выполните анализ данных!") return try: self.anomalies_df = self.detect_anomalies(self.cleaned_df) if self.anomalies_df.empty: messagebox.showinfo("Информация", "Аномалии не обнаружены.") return view_window = tk.Toplevel(self.root) view_window.title("Выбросы и аномалии") view_window.geometry("1000x600") view_window.configure(bg='#e6f3ff') search_frame = tk.Frame(view_window, bg='#e6f3ff', pady=10) search_frame.pack(fill='x', padx=10) tk.Label(search_frame, text="Поиск по Year:", bg='#e6f3ff', fg='black').pack(side='left') year_var = tk.StringVar() year_entry = ttk.Entry(search_frame, textvariable=year_var, width=15) year_entry.pack(side='left', padx=5) def search_by_year(): query = year_var.get().strip() if not query: update_treeview(self.anomalies_df) return try: year_int = int(query) filtered_df = self.anomalies_df[self.anomalies_df['Year'] == year_int] update_treeview(filtered_df) except ValueError: messagebox.showwarning("Ошибка", "Введите целое число") def reset_search(): year_var.set('') update_treeview(self.anomalies_df) tk.Button(search_frame, text="Поиск", command=search_by_year, bg='#d3d3d3', fg='black').pack(side='left', padx=5) tk.Button(search_frame, text="Сброс", command=reset_search, bg='#d3d3d3', fg='black').pack(side='left', padx=5) sort_frame = tk.Frame(view_window, bg='#e6f3ff', pady=10) sort_frame.pack(fill='x', padx=10) tk.Label(sort_frame, text="Сортировка по:", bg='#e6f3ff', fg='black').pack(side='left') sort_var = tk.StringVar() sort_combo = ttk.Combobox(sort_frame, textvariable=sort_var, values=list(self.anomalies_df.columns), state='readonly', width=25) sort_combo.pack(side='left', padx=5) def apply_sort(ascending): col = sort_var.get() if not col: return sorted_df = self.anomalies_df.sort_values(by=col, ascending=ascending) update_treeview(sorted_df) tk.Button(sort_frame, text="по возрастанию", command=lambda: apply_sort(True), bg='#d3d3d3', fg='black').pack(side='left', padx=5) tk.Button(sort_frame, text="по убыванию", command=lambda: apply_sort(False), bg='#d3d3d3', fg='black').pack(side='left', padx=5) tree_frame = tk.Frame(view_window, bg='white') tree_frame.pack(fill='both', expand=True, padx=10, pady=10) tree_scroll_y = ttk.Scrollbar(tree_frame, orient='vertical') tree_scroll_x = ttk.Scrollbar(tree_frame, orient='horizontal') tree = ttk.Treeview(tree_frame, columns=list(self.anomalies_df.columns), show='headings', yscrollcommand=tree_scroll_y.set, xscrollcommand=tree_scroll_x.set) tree.pack(side='left', fill='both', expand=True) tree_scroll_y.pack(side='right', fill='y') tree_scroll_x.pack(side='bottom', fill='x') tree_scroll_y.config(command=tree.yview) tree_scroll_x.config(command=tree.xview) for col in self.anomalies_df.columns: tree.heading(col, text=col) tree.column(col, width=100) def update_treeview(df): for item in tree.get_children(): tree.delete(item) for _, row in df.head(1000).iterrows(): values = [str(val) if pd.notna(val) else "" for val in row] tree.insert('', 'end', values=values) update_treeview(self.anomalies_df) except Exception as e: self.log(f"❌ Ошибка просмотра аномалий: {e}") messagebox.showerror("Ошибка", f"Не удалось отобразить аномалии: {e}") def save_anomalies_report(self): if self.anomalies_df is None or self.anomalies_df.empty: messagebox.showwarning("Ошибка", "Сначала обнаружьте аномалии!") return try: file_path = filedialog.asksaveasfilename( defaultextension=".xlsx", filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")], title="Сохранить отчет по аномалиям" ) if not file_path: return self.anomalies_df.to_excel(file_path, index=False, engine='openpyxl') self.log(f"✅ Отчет по аномалиям сохранён: {os.path.basename(file_path)}") messagebox.showinfo("Успех", f"Отчет сохранён!\n{file_path}") except Exception as e: self.log(f"❌ Ошибка сохранения отчёта: {e}") messagebox.showerror("Ошибка", f"Не удалось сохранить отчёт: {e}") def main(): try: root = tk.Tk() app = BookDataCleanerApp(root) root.mainloop() except Exception as e: messagebox.showerror("Критическая ошибка", f"Не удалось запустить приложение: {str(e)}") raise if __name__ == "__main__": main()