# ai7-m3/scheduler_bots/database_fresh.py

#!/usr/bin/env python
"""
database.py - School schedule database (normalized version)
Creates normalized tables and extracts from CSV with proper relationships
"""

import sqlite3
import csv
import os
import sys
import re

class SchoolScheduleDB:
    def __init__(self, db_name='school_schedule.db'):
        self.conn = sqlite3.connect(db_name)
        self.cursor = self.conn.cursor()
        # Initialize database tables
        self.create_tables()

    def create_tables(self):
        """Create the normalized schema if it does not exist yet.

        Tables: ``teachers``, ``subjects``, ``days``, ``periods``, ``groups``,
        ``students`` and ``schedule`` (which references all the others by
        foreign key).

        ``students`` and ``schedule`` carry UNIQUE constraints so that the
        ``INSERT OR IGNORE`` statements used by the importer really skip rows
        that are already present instead of duplicating them on re-import.

        Every statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this
        on an existing database is a no-op; tables created by an earlier
        version of the schema are not altered.
        """
        schema_statements = (
            # Teachers table
            """
            CREATE TABLE IF NOT EXISTS teachers (
                teacher_id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT UNIQUE NOT NULL,
                email TEXT,
                phone TEXT
            )
            """,
            # Subjects table
            """
            CREATE TABLE IF NOT EXISTS subjects (
                subject_id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT UNIQUE NOT NULL,
                description TEXT
            )
            """,
            # Days table
            """
            CREATE TABLE IF NOT EXISTS days (
                day_id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT UNIQUE NOT NULL  -- e.g., Monday, Tuesday, etc.
            )
            """,
            # Periods table - a period is identified by number + time slot
            """
            CREATE TABLE IF NOT EXISTS periods (
                period_id INTEGER PRIMARY KEY AUTOINCREMENT,
                period_number INTEGER,
                start_time TEXT,
                end_time TEXT,
                UNIQUE(period_number, start_time, end_time)
            )
            """,
            # Groups table
            """
            CREATE TABLE IF NOT EXISTS groups (
                group_id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT UNIQUE NOT NULL,
                description TEXT,
                class_name TEXT
            )
            """,
            # Students table - a student is identified by class + full name,
            # which is also the key the importer uses to look students up.
            """
            CREATE TABLE IF NOT EXISTS students (
                student_id INTEGER PRIMARY KEY AUTOINCREMENT,
                class_name TEXT,
                full_name TEXT NOT NULL,
                UNIQUE(class_name, full_name)
            )
            """,
            # Schedule table with foreign key relationships; identical
            # entries are rejected so INSERT OR IGNORE can deduplicate.
            """
            CREATE TABLE IF NOT EXISTS schedule (
                entry_id INTEGER PRIMARY KEY AUTOINCREMENT,
                student_id INTEGER,
                subject_id INTEGER,
                teacher_id INTEGER,
                day_id INTEGER,
                period_id INTEGER,
                group_id INTEGER,
                FOREIGN KEY (student_id) REFERENCES students(student_id),
                FOREIGN KEY (subject_id) REFERENCES subjects(subject_id),
                FOREIGN KEY (teacher_id) REFERENCES teachers(teacher_id),
                FOREIGN KEY (day_id) REFERENCES days(day_id),
                FOREIGN KEY (period_id) REFERENCES periods(period_id),
                FOREIGN KEY (group_id) REFERENCES groups(group_id),
                UNIQUE(student_id, subject_id, teacher_id, day_id, period_id, group_id)
            )
            """,
        )

        for statement in schema_statements:
            self.cursor.execute(statement)

        self.conn.commit()

    def populate_periods_table(self):
        """Seed the ``periods`` and ``days`` lookup tables.

        Inserts the thirteen standard lesson slots and the seven weekday
        names. Both inserts use ``INSERT OR IGNORE``, so rows that already
        exist are left alone. The changes are committed before returning.
        """
        period_times = {
            '1': ('09:00', '09:40'),
            '2': ('10:00', '10:40'),
            '3': ('11:00', '11:40'),
            '4': ('11:50', '12:30'),
            '5': ('12:40', '13:20'),
            '6': ('13:30', '14:10'),
            '7': ('14:20', '15:00'),
            '8': ('15:20', '16:00'),
            '9': ('16:15', '16:55'),
            '10': ('17:05', '17:45'),
            '11': ('17:55', '18:35'),
            '12': ('18:45', '19:20'),
            '13': ('19:20', '20:00')
        }
        period_rows = [
            (int(number), begins, ends)
            for number, (begins, ends) in period_times.items()
        ]
        self.cursor.executemany(
            "INSERT OR IGNORE INTO periods (period_number, start_time, end_time) VALUES (?, ?, ?)",
            period_rows,
        )

        # Days of the week, one single-column row per name.
        weekday_rows = [
            (weekday,)
            for weekday in ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
        ]
        self.cursor.executemany("INSERT OR IGNORE INTO days (name) VALUES (?)", weekday_rows)

        self.conn.commit()

    def update_database_from_csv(self, auto_update=True, sample_data_dir="sample_data"):
        """Import student-distribution CSV files from a directory.

        Files whose names contain ``first_sheet``, ``last_sheet`` or
        ``template`` are ignored. The remaining ``.csv`` files are listed in
        sorted order, so the numbers shown to the user are stable between
        runs.

        Args:
            auto_update: When True every candidate file is imported without
                asking. When False the user is asked on stdin whether to
                update and which files to use.
            sample_data_dir: Directory to scan for CSV files. Defaults to
                ``"sample_data"`` relative to the current working directory.

        Returns:
            None. Progress and the outcome are reported via ``print``.
        """
        # isdir (not exists): a plain file with this name would make
        # os.listdir raise instead of producing the friendly message.
        if not os.path.isdir(sample_data_dir):
            print(f"Directory '{sample_data_dir}' not found.")
            return

        # Keep only the actual student distribution files (not the sheets
        # or the schedule template).
        excluded_markers = ('first_sheet', 'last_sheet', 'template')
        csv_files = sorted(
            filename
            for filename in os.listdir(sample_data_dir)
            if filename.endswith('.csv')
            and not any(marker in filename for marker in excluded_markers)
        )

        if not csv_files:
            print(f"No student data CSV files found in '{sample_data_dir}' directory.")
            return

        print(f"Found {len(csv_files)} student data CSV file(s):")
        for number, filename in enumerate(csv_files, 1):
            print(f"  {number}. {filename}")

        if auto_update:
            print("\nAuto-updating database with all student data CSV files...")
            files_to_update = csv_files
        else:
            files_to_update = self._prompt_for_csv_selection(csv_files)
            if not files_to_update:
                return

        # Populate the periods and days tables first; schedule entries
        # reference them.
        self.populate_periods_table()

        print(f"\nUpdating database with {len(files_to_update)} file(s):")
        skipped = []
        for filename in files_to_update:
            print(f"  - {filename}")

            csv_path = os.path.join(sample_data_dir, filename)
            print(f"Processing {csv_path}...")

            if not self.process_csv_with_teacher_mapping(csv_path):
                skipped.append(filename)

        if skipped:
            print(f"Database update finished; {len(skipped)} file(s) were skipped: {', '.join(skipped)}")
        else:
            print("Database updated successfully with selected CSV data.")

    def _prompt_for_csv_selection(self, csv_files):
        """Ask the user on stdin which of ``csv_files`` to import.

        Returns the chosen file names (each at most once, in the order
        given), or None when the user declines or enters an invalid or
        empty selection. The reason is printed before returning None.
        """
        response = input("\nUpdate database with CSV files? (yes/no): ").lower()
        if response not in ['yes', 'y', 'да']:
            print("Skipping database update.")
            return None

        print("\n0. Update all files")
        selection = input("\nSelect file(s) to update (0 for all, or comma-separated numbers like 1,2,3): ")

        if selection.strip() == '0':
            return csv_files

        try:
            indices = [int(token.strip()) - 1 for token in selection.split(',')]
        except ValueError:
            print("Invalid input. Please enter numbers separated by commas or '0' for all files.")
            return None

        # dict.fromkeys keeps the first occurrence of each index, so a
        # repeated number does not import the same file twice.
        chosen = [csv_files[i] for i in dict.fromkeys(indices) if 0 <= i < len(csv_files)]
        if not chosen:
            print("No valid selections made.")
            return None
        return chosen

    def process_csv_with_teacher_mapping(self, csv_file):
        """Import one student-distribution CSV into the normalized tables.

        The file is expected to contain a header row (recognised by a
        "ФИО"/"Ф.И.О." cell, or by class + name keywords), optional rows
        above it with teacher names positioned over the subject columns,
        and one row per student below it. Every usable cell in a subject
        column becomes a schedule entry for that student.

        Args:
            csv_file: Path to a UTF-8 encoded CSV file.

        Returns:
            True when the file was recognised and its student rows were
            processed (changes are committed). False when the file is
            missing, no header row can be found, or the header has no
            identifiable class/name column.
        """
        if not os.path.exists(csv_file):
            return False

        # newline='' lets the csv module handle line endings itself, which
        # matters for quoted cells that contain embedded newlines.
        with open(csv_file, 'r', encoding='utf-8', newline='') as file:
            rows = list(csv.reader(file))

        header_idx = self._find_header_row_index(rows)
        if header_idx is None:
            print(f"Skipping {csv_file} - does not appear to be student data with ФИО/class columns")
            return False

        header_row = rows[header_idx]

        # The class/name columns depend only on the header, so resolve them
        # once instead of once per student row.
        class_col_idx, name_col_idx = self._find_class_and_name_columns(header_row)
        if class_col_idx is None or name_col_idx is None:
            print(f"Skipping {csv_file} - could not locate class and name columns in the header row")
            return False

        header_subjects = self._collect_header_subjects(header_row)
        teacher_subject_map = self._build_teacher_subject_map(rows, header_idx, header_subjects)

        for student_row in rows[header_idx + 1:]:
            self._import_student_row(
                student_row, header_row, class_col_idx, name_col_idx, teacher_subject_map
            )

        self.conn.commit()
        return True

    def _find_header_row_index(self, rows):
        """Return the index of the header row in ``rows``, or None."""
        # Preferred marker: a cell mentioning "ФИО" / "Ф.И.О." (any case).
        for idx, row in enumerate(rows):
            for cell in row:
                lowered = str(cell).lower()
                if "фио" in lowered or "ф.и.о." in lowered:
                    return idx

        class_words = ('класс', 'class')
        name_words = ('имя', 'name', 'фамилия', 'surname')

        def mentions(row, words):
            return any(word in str(cell).lower() for cell in row for word in words)

        # Fallback: only attempted when both kinds of keyword occur somewhere
        # in the first 10 rows; the header is then the first row (anywhere in
        # the file) that contains both a class and a name keyword.
        preview = rows[:10]
        if (any(mentions(row, class_words) for row in preview)
                and any(mentions(row, name_words) for row in preview)):
            for idx, row in enumerate(rows):
                if mentions(row, class_words) and mentions(row, name_words):
                    return idx
        return None

    def _find_class_and_name_columns(self, header_row):
        """Return ``(class_col_idx, name_col_idx)``; either may be None."""
        def first_column(predicate):
            return next(
                (idx for idx, title in enumerate(header_row) if predicate(str(title))),
                None,
            )

        class_col_idx = first_column(
            lambda title: any(word in title for word in ("Класс", "класс", "Class", "class"))
        )
        if class_col_idx is None:
            # Same keywords, case-insensitively (e.g. "КЛАСС"), matching how
            # the header row itself is detected.
            class_col_idx = first_column(
                lambda title: any(word in title.lower() for word in ('класс', 'class'))
            )

        name_col_idx = first_column(
            lambda title: "ФИО" in title or "ф.и.о." in title.lower() or "name" in title.lower()
        )
        if name_col_idx is None:
            # Header detection also accepts "Фио", "Имя" and "Фамилия"; without
            # this fallback such files would be recognised but import nothing.
            name_col_idx = first_column(
                lambda title: any(word in title.lower() for word in ('фио', 'имя', 'фамилия'))
            )

        return class_col_idx, name_col_idx

    def _collect_header_subjects(self, header_row):
        """Map column index -> subject title for the subject columns of the header."""
        exact_non_subjects = {'ф.и.о.', 'фио', 'класс', 'номер', 'сортировка', 'шкафчика', 'локера'}
        substring_non_subjects = ("ф.и.о", "сортировка", "номер")

        header_subjects = {}
        for col_idx, raw_title in enumerate(header_row):
            title = str(raw_title).strip()
            lowered = title.lower()
            if (not title
                    or lowered in exact_non_subjects
                    or any(marker in lowered for marker in substring_non_subjects)
                    or "№" in title):
                continue
            header_subjects[col_idx] = title
        return header_subjects

    def _subject_for_teacher_cell(self, row, col, header_subjects):
        """Return the header subject associated with a teacher cell, or None.

        Priority: the column to the right when that cell is non-empty and is
        a subject column; otherwise, when the left neighbour looks like a
        teacher/department label, the right column and then the cell's own
        column; otherwise the cell's own column.
        """
        left_context = str(row[col - 1]).strip() if col > 0 else ""
        right_context = str(row[col + 1]).strip() if col < len(row) - 1 else ""

        if right_context and col + 1 in header_subjects:
            return header_subjects[col + 1]

        label_keywords = ('учитель', 'teacher', 'кафедра', 'department')
        if left_context and any(keyword in left_context.lower() for keyword in label_keywords):
            if col + 1 in header_subjects:
                return header_subjects[col + 1]
            return header_subjects.get(col)

        return header_subjects.get(col)

    def _build_teacher_subject_map(self, rows, header_idx, header_subjects):
        """Build ``{subject title: teacher name}`` from the rows above the header.

        The first teacher found for a subject wins; later candidates for the
        same subject are ignored.
        """
        teacher_subject_map = {}

        # Pass 1: the first (up to 15) rows of the file that precede the
        # header, using the neighbour-aware matching.
        for row in rows[:min(15, header_idx)]:
            for col, cell_value in enumerate(row):
                cell_str = str(cell_value).strip()

                if self._is_likely_teacher_name(cell_str):
                    candidates = [cell_str]
                elif '\n' in cell_str or '\\n' in cell_str:
                    # A cell may list several names separated by real or
                    # literal "\n" sequences; consider each one separately.
                    parts = (part.strip() for part in cell_str.replace('\\n', '\n').split('\n'))
                    candidates = [part for part in parts if part and self._is_likely_teacher_name(part)]
                else:
                    continue

                if not candidates:
                    continue
                matched_subject = self._subject_for_teacher_cell(row, col, header_subjects)
                if matched_subject:
                    teacher_subject_map.setdefault(matched_subject, candidates[0])

        # Pass 2: the 5 rows immediately above the header, where a teacher
        # name sitting directly in a subject column maps to that subject.
        for row in rows[max(0, header_idx - 5):header_idx]:
            for col, cell_value in enumerate(row):
                cell_str = str(cell_value).strip()
                if col in header_subjects and self._is_likely_teacher_name(cell_str):
                    teacher_subject_map.setdefault(header_subjects[col], cell_str)

        return teacher_subject_map

    def _import_student_row(self, student_row, header_row, class_col_idx, name_col_idx,
                            teacher_subject_map):
        """Insert one student row and its schedule entries; skip invalid rows."""
        if len(student_row) <= max(class_col_idx, name_col_idx):
            return

        class_name = student_row[class_col_idx].strip()
        name = student_row[name_col_idx].strip()
        if not class_name or not name:
            return
        if not self._is_valid_student_record_by_cols(student_row, class_col_idx, name_col_idx):
            return

        self.cursor.execute(
            "INSERT OR IGNORE INTO students (class_name, full_name) VALUES (?, ?)",
            (class_name, name)
        )
        self.cursor.execute(
            "SELECT student_id FROM students WHERE full_name = ? AND class_name = ?",
            (name, class_name)
        )
        student_id_result = self.cursor.fetchone()
        if student_id_result is None:
            return
        student_id = student_id_result[0]

        # NOTE(review): the first three columns are always treated as
        # metadata, presumably because the source sheets start with
        # number/class/name - confirm against the real files.
        metadata_columns = {0, 1, 2, class_col_idx, name_col_idx}
        non_schedule_markers = ("сортировка", "номер", "шкафчика", "локера")

        # zip stops at the shorter row, so cells without a header are ignored.
        for col_idx, (cell_value, subject_header) in enumerate(zip(student_row, header_row)):
            if not cell_value or col_idx in metadata_columns:
                continue
            if any(marker in subject_header.lower() for marker in non_schedule_markers):
                continue

            group_assignment = cell_value.strip()
            if not group_assignment or group_assignment.lower() == "nan" or group_assignment == "-":
                continue

            subject_name = str(subject_header).strip()
            teacher_name = teacher_subject_map.get(subject_name, f"Default Teacher for {subject_name}")
            self._process_schedule_entry_with_teacher_mapping(
                student_id, group_assignment, subject_name, teacher_name
            )

    def _is_valid_student_record_by_cols(self, row, class_col_idx, name_col_idx):
        """Decide whether ``row`` holds a real student in the given columns.

        A row qualifies when both columns exist, both values are non-empty
        after stripping, the two values differ, and the name value does not
        itself look like a class code (digits followed by one capital
        letter, e.g. 6А, 11А, 4B).

        Returns:
            bool: True for a usable student record, False otherwise.
        """
        highest_needed = max(class_col_idx, name_col_idx)
        if not len(row) > highest_needed:
            return False

        klass = row[class_col_idx].strip()
        pupil = row[name_col_idx].strip()

        class_code = r'^\d+[А-ЯA-Z]$'  # e.g., 6А, 11А, 4B

        if re.match(class_code, klass):
            # Proper class code: only a distinct, non-empty name is needed.
            return bool(pupil) and pupil != klass

        if re.match(class_code, pupil):
            # A class code sitting in the name column is not a student.
            return False

        if not klass or not pupil:
            return False
        return klass != pupil

    def _process_schedule_entry_with_teacher_mapping(self, student_id, group_info, subject_info, teacher_name):
        """Create one ``schedule`` row, inserting the lookup rows it needs.

        Args:
            student_id: ``students.student_id`` the entry belongs to.
            group_info: Raw group cell text. Blank values, ``"nan"`` (any
                case) and ``"-"`` mean "no entry" and are ignored.
            subject_info: Subject title; blank becomes ``"General Class"``.
            teacher_name: Teacher to attach. If no teacher row can be found
                or created for it (e.g. it is None), the placeholder teacher
                "Неизвестный преподаватель" is used instead.

        Notes:
            The source data carries no day/period information, so the entry
            is assigned a random weekday (Monday-Friday) and period 1 as a
            placeholder. If the ``periods`` table is empty it is seeded via
            ``populate_periods_table`` (which commits) instead of failing.
            Nothing else is committed here; the caller commits.
        """
        import random

        # Clean up the inputs
        subject_name = subject_info.strip() or "General Class"
        group_assignment = group_info.strip()

        # Only proceed if the cell really names a group.
        if not group_assignment or group_assignment.lower() == "nan" or group_assignment == "-":
            return

        subject_id = self._insert_and_fetch_id(
            "INSERT OR IGNORE INTO subjects (name) VALUES (?)",
            "SELECT subject_id FROM subjects WHERE name = ?",
            subject_name,
        )

        teacher_id = self._insert_and_fetch_id(
            "INSERT OR IGNORE INTO teachers (name) VALUES (?)",
            "SELECT teacher_id FROM teachers WHERE name = ?",
            teacher_name,
        )
        if teacher_id is None:
            # Fallback to a default teacher if the extracted name is unusable
            teacher_id = self._insert_and_fetch_id(
                "INSERT OR IGNORE INTO teachers (name) VALUES (?)",
                "SELECT teacher_id FROM teachers WHERE name = ?",
                "Неизвестный преподаватель",
            )

        # Placeholder day: random weekday until real day data is available.
        selected_day = random.choice(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"])
        day_id = self._insert_and_fetch_id(
            "INSERT OR IGNORE INTO days (name) VALUES (?)",
            "SELECT day_id FROM days WHERE name = ?",
            selected_day,
        )

        # Placeholder period: period 1, or any period if there is no period 1.
        period_id = self._default_period_id()
        if period_id is None:
            # Empty periods table (importer called without seeding first):
            # seed the standard periods rather than crash on a missing row.
            self.populate_periods_table()
            period_id = self._default_period_id()

        # Clean the group name to separate it from student data
        group_name = self._clean_group_name(group_assignment)
        group_id = self._insert_and_fetch_id(
            "INSERT OR IGNORE INTO groups (name) VALUES (?)",
            "SELECT group_id FROM groups WHERE name = ?",
            group_name,
        )

        # Insert the schedule entry
        self.cursor.execute("""
            INSERT OR IGNORE INTO schedule (student_id, subject_id, teacher_id, day_id, period_id, group_id)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (student_id, subject_id, teacher_id, day_id, period_id, group_id))

    def _insert_and_fetch_id(self, insert_sql, select_sql, value):
        """Run an insert-if-missing statement, then return the row's id (or None)."""
        self.cursor.execute(insert_sql, (value,))
        self.cursor.execute(select_sql, (value,))
        found = self.cursor.fetchone()
        return found[0] if found else None

    def _default_period_id(self):
        """Return the id of period 1, else of any period, else None."""
        self.cursor.execute("SELECT period_id FROM periods WHERE period_number = 1 LIMIT 1")
        found = self.cursor.fetchone()
        if found is None:
            self.cursor.execute("SELECT period_id FROM periods LIMIT 1")
            found = self.cursor.fetchone()
        return found[0] if found else None

    def _clean_group_name(self, raw_group_data):
        """Return the group name to store for a raw schedule cell.

        The stripped text is kept as-is when it starts like a class code
        (digits followed by a capital letter), contains a group keyword
        (case-insensitive), or contains a known subject keyword
        (case-sensitive). Anything else is replaced by a generic
        ``Group_<n>`` name with ``n`` in 0..9999.

        ``n`` is derived from a CRC32 of the UTF-8 text rather than the
        builtin ``hash()``: string hashes are salted per process, which would
        give the same cell a different group name on every run.
        """
        import zlib

        cleaned = raw_group_data.strip()

        # Class designation such as "6А" or "11B-2": return as is.
        if re.match(r'^\d+[А-ЯA-Z]', cleaned):
            return cleaned

        # Common group indicators: return as is.
        group_indicators = ['кл', 'class', 'club', 'track', 'group', 'module', '-']
        lowered = cleaned.lower()
        if any(indicator in lowered for indicator in group_indicators):
            return cleaned

        # Subject-identifier pattern: return as is.
        subject_indicators = ['ICT', 'English', 'Math', 'Physics', 'Chemistry', 'Biology', 'Science']
        if any(indicator in cleaned for indicator in subject_indicators):
            return cleaned

        # Nothing recognisable: fall back to a stable generic name.
        stable_bucket = zlib.crc32(cleaned.encode('utf-8')) % 10000
        return f"Group_{stable_bucket}"

    def _is_likely_teacher_name(self, text):
        """Heuristically decide whether ``text`` looks like a teacher's name.

        The stripped text must be at least 5 characters long, must not be a
        known filler word, and must not match any exclusion pattern (group,
        track or club labels, anything starting with a letter and containing
        digits, ...). It is then accepted when it matches one of the explicit
        name shapes ("Иванов А.А.", "А.А. Иванов", "John Smith", ...) or, as a
        last resort, when it has at least two words and every word longer
        than one character starts with an uppercase letter.

        Returns:
            bool: True when the text is probably a teacher name.
        """
        if not text:
            return False

        text = text.strip()
        if len(text) < 5:  # Require minimum length for a name
            return False

        # Common non-name values that appear in the CSV. Compared against the
        # lowercased text, so every entry is lowercase.
        common_non_names = (
            '-', 'nan', 'нет', 'нету', 'отсутствует', 'учитель', 'teacher', '',
            'е4 е5', 'e4 e5', 'группа', 'group',
        )
        if text.lower() in common_non_names:
            return False

        # Exclusion patterns for non-teacher entries (matched case-insensitively
        # from the start of the text).
        exclusion_patterns = [
            r'^[А-ЯЁ]\d+\s+[А-ЯЁ]\d+$',      # E4 E5 pattern
            r'^[A-Z]\d+\s+[A-Z]\d+$',       # English groups
            r'.*[Tt]rack.*',                 # Track identifiers
            r'.*[Gg]roup.*',                 # Group identifiers
            r'.*\d+[А-ЯA-Z]\d*$',           # Number-letter combos
            r'^[А-ЯЁA-Z].*\d+',              # Starts with a letter, digits later
            r'.*[Cc]lub.*',                  # Club identifiers
        ]
        if any(re.match(pattern, text, re.IGNORECASE) for pattern in exclusion_patterns):
            return False

        # Positive patterns for teacher names (case-sensitive).
        teacher_patterns = [
            r'^[А-ЯЁ][а-яё]+\s+[А-ЯЁ]\.\s*[А-ЯЁ]\.$',     # Иванов А.А.
            r'^[А-ЯЁ]\.\s*[А-ЯЁ]\.\s+[А-ЯЁ][а-яё]+$',     # А.А. Иванов
            r'^[А-ЯЁ][а-яё]+\s+[А-ЯЁ][а-яё]+\s+[А-ЯЁ][а-яё]+$', # Full name
            r'^[A-Z][a-z]+\s+[A-Z][a-z]+$',               # John Smith
            r'^[A-Z][a-z]+\s+[A-Z]\.\s*[A-Z]\.$',        # Smith J.J.
            r'^[А-ЯЁ][а-яё]+\s+[А-ЯЁ][а-яё]+$',          # Russian names without patronymic
        ]
        if any(re.match(pattern, text) for pattern in teacher_patterns):
            return True

        # Last resort: at least two words whose multi-character parts are all
        # capitalised. Single-character parts (bare initials) are not checked,
        # but there must be at least one real word; otherwise strings such as
        # "- - -" would pass because all() of nothing is True.
        name_parts = text.split()
        checked_parts = [part for part in name_parts if len(part) > 1]
        if len(name_parts) >= 2 and checked_parts:
            return all(part[0].isupper() for part in checked_parts)

        return False

    def _is_likely_subject_label(self, text):
        """Tell whether ``text`` mentions a known subject name or abbreviation.

        The comparison is a case-insensitive substring search of the
        stripped text against the labels below ('Матем.', 'Информ.',
        'Англ.яз', 'ICT', ...). Texts shorter than two characters are never
        subject labels.

        Returns:
            bool: True when any known label occurs inside the text.
        """
        if not text or len(text) < 2:
            return False

        haystack = text.strip().lower()

        # Common Russian abbreviations for subjects, plus English names.
        subject_patterns = [
            'Матем.', 'Информ.', 'Англ.яз', 'Русск.яз', 'Физика', 'Химия', 'Биол', 'История',
            'Общество', 'География', 'Литер', 'Физкульт', 'Технотрек', 'Лидерство',
            'Спорт. клуб', 'ОРКСЭ', 'Китайск', 'Немецк', 'Француз', 'Speaking club', 'Maths',
            'ICT', 'Geography', 'Physics', 'Robotics', 'Culinary', 'Science', 'AI Core', 'VR/AR',
            'CyberSafety', 'Business', 'Design', 'Prototype', 'MediaCom', 'Science', 'Robotics',
            'Culinary', 'AI Core', 'VR/AR', 'CyberSafety', 'Business', 'Design', 'Prototype',
            'MediaCom', 'Robotics Track', 'Culinary Track', 'Science Track', 'AI Core Track',
            'VR/AR Track', 'CyberSafety Track', 'Business Track', 'Design Track', 'Prototype Track',
            'MediaCom Track', 'Math', 'Algebra', 'Geometry', 'Calculus', 'Statistics', 'Coding',
            'Programming', 'Algorithm', 'Logic', 'Robotics', 'Physical Education', 'PE', 'Sports',
            'Swimming', 'Fitness', 'Gymnastics', 'Climbing', 'Games', 'Art', 'Music', 'Dance',
            'Karate', 'Judo', 'Martial Arts', 'Chess', 'Leadership', 'Entrepreneurship'
        ]
        if any(label.lower() in haystack for label in subject_patterns):
            return True

        # Specific (already lowercase) subject names found in the data.
        specific_subjects = ['матем.', 'информ.', 'англ.яз', 'русск.яз', 'каб.', 'business', 'maths',
                             'speaking', 'ict', 'geography', 'physics', 'robotics', 'science', 'ai core',
                             'vr/ar', 'cybersafety', 'design', 'prototype', 'mediacom', 'culinary',
                             'physical education', 'pe', 'sports', 'swimming', 'fitness', 'gymnastics',
                             'climbing', 'games', 'art', 'music', 'dance', 'karate', 'chess', 'leadership']
        return any(label in haystack for label in specific_subjects)

    def _find_matching_subject_in_header_from_list(self, subject_label, header_subjects, header_row):
        """Resolve a (possibly abbreviated) subject label to a header title.

        Matching is attempted in three stages, all case-insensitive:
        substring match against ``header_subjects``, substring match against
        every cell of ``header_row``, and finally a shared-stem match
        ('матем', 'информ', 'англ', ...) against ``header_subjects``.

        Args:
            subject_label: Label such as 'Матем.' or 'Англ.яз'. Dots are
                dropped and a bare 'яз' is expanded to 'язык' before
                matching.
            header_subjects: Iterable of ``(col_idx, title)`` pairs, or a
                ``{col_idx: title}`` dict.
            header_row: The raw header cells.

        Returns:
            The matching title (stripped when it comes from ``header_row``),
            or None when nothing matches.
        """
        if not subject_label:
            return None

        if isinstance(header_subjects, dict):
            subject_pairs = list(header_subjects.items())
        else:
            subject_pairs = list(header_subjects)

        # Expand the abbreviation 'яз' only where it is not already 'язык';
        # a plain replace would turn 'язык' into 'языкык'.
        subject_label_lower = re.sub(
            r'яз(?!ык)', 'язык', subject_label.lower().replace('.', '')
        )
        if not subject_label_lower.strip():
            # Nothing left to match on (e.g. the label was only dots).
            return None

        # Direct match first. Blank titles are skipped: an empty string is a
        # substring of everything and would always "match".
        for _, full_subj in subject_pairs:
            full_lower = full_subj.lower()
            if not full_lower.strip():
                continue
            if subject_label_lower in full_lower or full_lower in subject_label_lower:
                return full_subj

        # If no direct match, try partial matching in the whole header row.
        for header_item in header_row:
            item_lower = str(header_item).lower()
            if not item_lower.strip():
                continue
            if subject_label_lower in item_lower or item_lower in subject_label_lower:
                return str(header_item).strip()

        # Try more general matching - label and title share a subject stem.
        subject_stems = ('матем', 'информ', 'англ', 'русск', 'физик', 'хим',
                         'биол', 'истор', 'общ', 'географ')
        for _, full_subj in subject_pairs:
            full_lower = full_subj.lower()
            if any(stem in subject_label_lower and stem in full_lower for stem in subject_stems):
                return full_subj

        return None

    def find_student(self, name_query):
        """Look up students whose full name contains ``name_query``.

        Returns:
            A list of at most 10 ``(full_name, class_name)`` tuples.
        """
        like_pattern = f'%{name_query}%'
        search_sql = """
            SELECT s.full_name, s.class_name
            FROM students s
            WHERE s.full_name LIKE ?
            LIMIT 10
        """
        return self.cursor.execute(search_sql, (like_pattern,)).fetchall()

    def get_current_class(self, student_name, current_day, current_time):
        """Return the lesson a student is in at the given day and time.

        Args:
            student_name: Exact ``students.full_name`` value.
            current_day: Day name as stored in ``days.name``.
            current_time: Time string compared against the period's
                ``start_time``/``end_time`` text (stored as ``HH:MM``).

        Returns:
            A ``(subject, teacher, start_time, end_time)`` tuple for the
            first matching schedule entry, or None when there is none.
        """
        lookup_sql = """
            SELECT sub.name, t.name, p.start_time, p.end_time
            FROM schedule sch
            JOIN students s ON sch.student_id = s.student_id
            JOIN subjects sub ON sch.subject_id = sub.subject_id
            JOIN teachers t ON sch.teacher_id = t.teacher_id
            JOIN days d ON sch.day_id = d.day_id
            JOIN periods p ON sch.period_id = p.period_id
            JOIN groups g ON sch.group_id = g.group_id
            WHERE s.full_name = ?
            AND d.name = ?
            AND p.start_time <= ?
            AND p.end_time >= ?
        """
        # The time is bound twice: once per bound of the period interval.
        bindings = (student_name, current_day, current_time, current_time)
        return self.cursor.execute(lookup_sql, bindings).fetchone()

    def close(self):
        """Close database connection"""
        self.conn.close()

# Main execution - set up the database and import the CSV data
if __name__ == "__main__":
    # Passing ``--auto`` as the first argument imports every file without
    # prompting; otherwise the user is asked interactively.
    auto_update = len(sys.argv) > 1 and sys.argv[1] == '--auto'
    db = SchoolScheduleDB()
    try:
        db.update_database_from_csv(auto_update=auto_update)
    finally:
        # Release the SQLite handle even if the import raises.
        db.close()