ai7-m3/scheduler_bots/database_backup.py

    def process_csv_with_teacher_mapping(self, csv_file):
        """Process CSV with teacher-subject mapping based on positional order"""
        if not os.path.exists(csv_file):
            return False

        with open(csv_file, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            rows = list(reader)

        # Identify header row - look for the row containing "ФИО" (full name) or similar indicators
        header_idx = None
        for i, row in enumerate(rows):
            for cell in row:
                if "ФИО" in str(cell) or "фио" in str(cell).lower() or "Ф.И.О." in str(cell) or "ф.и.о." in str(cell):
                    header_idx = i
                    break
            if header_idx is not None:
                break

        if header_idx is None:
            # Check if this file contains class and name columns that identify it as a student data file
            # Even if the header doesn't contain ФИО, we might still be able to identify student data
            has_class_indicators = any(
                any(indicator in str(cell).lower() for cell in row for indicator in ['класс', 'class'])
                for row in rows[:min(len(rows), 10)]  # Check first 10 rows
            )

            has_name_indicators = any(
                any(indicator in str(cell).lower() for cell in row for indicator in ['имя', 'name', 'фамилия', 'surname'])
                for row in rows[:min(len(rows), 10)]  # Check first 10 rows
            )

            if has_class_indicators and has_name_indicators:
                # Try to find the header row by looking for class and name indicators
                for i, row in enumerate(rows):
                    if any(indicator in str(cell).lower() for cell in row for indicator in ['класс', 'class']) and \
                       any(indicator in str(cell).lower() for cell in row for indicator in ['имя', 'name', 'фамилия', 'surname']):
                        header_idx = i
                        break

            if header_idx is None:
                print(f"Skipping {csv_file} - does not appear to be student data with ФИО/class columns")
                return False

        # Find teacher-subject mappings in the first 0-15 rows before the header
        teacher_subject_map = {}

        # Build a mapping of subject names in the header row
        header_row = rows[header_idx]
        header_subjects = {}
        for col_idx, subject_name in enumerate(header_row):
            subject_name = str(subject_name).strip()
            if (subject_name and
                subject_name.lower() not in ['ф.и.о.', 'фио', 'класс', 'номер', 'сортировка', 'шкафчика', 'локера'] and
                subject_name.strip() != "" and
                "ф.и.о" not in subject_name.lower() and
                "сортировка" not in subject_name.lower() and
                "номер" not in subject_name.lower() and
                "№" not in subject_name):
                header_subjects[col_idx] = subject_name  # Map column index to subject name

        # First, try to find teachers in the rows before the header
        for i in range(min(15, header_idx)):  # Check first 15 rows before header
            current_row = rows[i]

            # Process all cells in the row to find teacher names and their adjacent context
            for j, cell_value in enumerate(current_row):
                cell_str = str(cell_value).strip()

                # Check if this cell is a likely teacher name
                if self._is_likely_teacher_name(cell_str):
                    # Look for context on the left (department) and right (subject)
                    left_context = ""
                    right_context = ""

                    # Get left neighbor (department)
                    if j > 0 and j-1 < len(current_row):
                        left_context = str(current_row[j-1]).strip()

                    # Get right neighbor (subject)
                    if j < len(current_row) - 1:
                        right_context = str(current_row[j+1]).strip()

                    # Try to determine the subject based on adjacency
                    matched_subject = None

                    # First priority: right neighbor if it matches a subject in the header
                    if right_context and j+1 in header_subjects:
                        matched_subject = header_subjects[j+1]
                    # Second priority: use left context if it semantically relates to a teacher
                    elif left_context and any(keyword in left_context.lower() for keyword in ['учитель', 'teacher', 'кафедра', 'department']):
                        # If left context indicates a department, look for subject to the right of teacher
                        if j+1 in header_subjects:
                            matched_subject = header_subjects[j+1]
                        # If no subject to the right, try to map by position
                        elif j in header_subjects:
                            matched_subject = header_subjects[j]
                    # Third priority: try to map by position
                    elif j in header_subjects:
                        matched_subject = header_subjects[j]

                    # Only add if we don't have a better teacher name for this subject yet
                    if matched_subject and (matched_subject not in teacher_subject_map or
                                            'Default Teacher for' in teacher_subject_map.get(matched_subject, '')):
                        teacher_subject_map[matched_subject] = cell_str

                # If the cell contains multiple names (separated by newlines), process each separately
                elif '\n' in cell_str or '\\n' in cell_str:
                    cell_parts = [part.strip() for part in cell_str.replace('\\n', '\n').split('\n') if part.strip()]
                    for part in cell_parts:
                        if self._is_likely_teacher_name(part):
                            # Look for context on the left (department) and right (subject)
                            left_context = ""
                            right_context = ""

                            # Get left neighbor (department)
                            if j > 0 and j-1 < len(current_row):
                                left_context = str(current_row[j-1]).strip()

                            # Get right neighbor (subject)
                            if j < len(current_row) - 1:
                                right_context = str(current_row[j+1]).strip()

                            # Try to determine the subject based on adjacency
                            matched_subject = None

                            # First priority: right neighbor if it matches a subject in the header
                            if right_context and j+1 in header_subjects:
                                matched_subject = header_subjects[j+1]
                            # Second priority: use left context if it semantically relates to a teacher
                            elif left_context and any(keyword in left_context.lower() for keyword in ['учитель', 'teacher', 'кафедра', 'department']):
                                # If left context indicates a department, look for subject to the right of teacher
                                if j+1 in header_subjects:
                                    matched_subject = header_subjects[j+1]
                                # If no subject to the right, try to map by position
                                elif j in header_subjects:
                                    matched_subject = header_subjects[j]
                            # Third priority: try to map by position
                            elif j in header_subjects:
                                matched_subject = header_subjects[j]

                            # Only add if we don't have a better teacher name for this subject yet
                            if matched_subject and (matched_subject not in teacher_subject_map or
                                                    'Default Teacher for' in teacher_subject_map.get(matched_subject, '')):
                                teacher_subject_map[matched_subject] = part

        # Additional teacher-subject mapping: scan the rows immediately before the header for teacher names in subject columns
        # In many CSV files, teacher names appear in the same rows as subject headers
        for i in range(max(0, header_idx - 5), header_idx):  # Check 5 rows before header
            current_row = rows[i]
            for j, cell_value in enumerate(current_row):
                cell_str = str(cell_value).strip()

                # If cell contains a likely teacher name and corresponds to a subject column
                if self._is_likely_teacher_name(cell_str) and j in header_subjects:
                    subject_name = header_subjects[j]
                    # Only add if we don't have a better teacher name for this subject yet
                    if (subject_name not in teacher_subject_map or
                        'Default Teacher for' in teacher_subject_map.get(subject_name, '')):
                        teacher_subject_map[subject_name] = cell_str

        # Additional validation: Remove any teacher-subject mappings that seem incorrect
        validated_teacher_subject_map = {}
        for subject, teacher in teacher_subject_map.items():
            # Only add to validated map if teacher name passes all checks
            if self._is_likely_teacher_name(teacher):
                validated_teacher_subject_map[subject] = teacher
            else:
                print(f"Warning: Invalid teacher name '{teacher}' detected for subject '{subject}', skipping...")

        teacher_subject_map = validated_teacher_subject_map

        # Process each student row
        for student_row in rows[header_idx + 1:]:
            # Determine the structure dynamically based on the header
            class_col_idx = None
            name_col_idx = None

            # Find the index of the class column (usually called "Класс")
            for idx, header in enumerate(header_row):
                if "Класс" in str(header) or "класс" in str(header) or "Class" in str(header) or "class" in str(header):
                    class_col_idx = idx
                    break

            # Find the index of the name column (usually called "ФИО")
            for idx, header in enumerate(header_row):
                if "ФИО" in str(header) or "ф.и.о." in str(header).lower() or "name" in str(header).lower():
                    name_col_idx = idx
                    break

            # If we couldn't find the columns properly, skip this row
            if class_col_idx is None or name_col_idx is None:
                continue

            # Check if this row has valid data in the expected columns
            if (len(student_row) > max(class_col_idx, name_col_idx) and
                student_row[class_col_idx].strip() and  # class name exists
                student_row[name_col_idx].strip() and   # student name exists
                self._is_valid_student_record_by_cols(student_row, class_col_idx, name_col_idx)):

                name = student_row[name_col_idx].strip()  # Name column
                class_name = student_row[class_col_idx].strip()  # Class column

                # Insert student into the database
                self.cursor.execute(
                    "INSERT OR IGNORE INTO students (class_name, full_name) VALUES (?, ?)",
                    (class_name, name)
                )

                # Get the student_id for this student
                self.cursor.execute("SELECT student_id FROM students WHERE full_name = ? AND class_name = ?", (name, class_name))
                student_id_result = self.cursor.fetchone()
                if student_id_result is None:
                    continue
                student_id = student_id_result[0]

                # Process schedule data for this student
                # Go through each column to find subject and group info
                for col_idx, cell_value in enumerate(student_row):
                    if cell_value and col_idx < len(header_row):
                        # Get the subject from the header
                        subject_header = header_row[col_idx] if col_idx < len(header_row) else ""

                        # Skip columns that don't contain schedule information
                        if (col_idx == 0 or col_idx == 1 or col_idx == 2 or col_idx == class_col_idx or col_idx == name_col_idx or  # skip metadata cols
                            "сортировка" in subject_header.lower() or
                            "номер" in subject_header.lower() or
                            "шкафчика" in subject_header.lower() or
                            "локера" in subject_header.lower()):
                            continue

                        # Extract group information from the cell
                        group_assignment = cell_value.strip()

                        if group_assignment and group_assignment.lower() != "nan" and group_assignment != "-" and group_assignment != "":
                            # Find the teacher associated with this subject
                            subject_name = str(subject_header).strip()
                            teacher_name = teacher_subject_map.get(subject_name, f"Default Teacher for {subject_name}")

                            # Insert the entities into their respective tables first
                            # Then get their IDs to create the schedule entry
                            self._process_schedule_entry_with_teacher_mapping(
                                student_id, group_assignment, subject_name, teacher_name
                            )

        self.conn.commit()
        return True