#!/usr/bin/env python
"""
consolidate_theses.py - Consolidates all HTML thesis files into a single HTML file
with clear separation between different documents
"""

import html
import os
import re

# Default location of the individual thesis HTML files.
DEFAULT_PARENT_DIR = "/Users/home/YandexDisk/TECHNOLYCEUM/ict/Year/2025/ai/ai7/ai7-m3"

# Default list of HTML thesis files to consolidate (names must match the files on disk).
DEFAULT_HTML_FILES = (
    "Lesson_ SQLite Database Implementation.html",
    "Presentaion_School Schedule Assistant Bot _ Student Project.html",
    "Professional_Thesis_Scheduler_Bot.html",
    "Scheduler Bot_ Telegram & CSV Database.html",
    "Student Database Search System _ Beginner's Guide.html",
    "Thesis_ Intelligent School Schedule Management System_23_Jan_2026.html",
    "Thesis_AI7_Building_A_ Scheduler_Bot_A Student Project.html",
)

# Default output file, relative to the current working directory.
DEFAULT_OUTPUT_FILE = "consolidated_theses.html"

# Patterns are compiled once; \b keeps "<head" from matching "<header>".
_DOCTYPE_RE = re.compile(r"<!DOCTYPE[^>]*>", re.IGNORECASE)
_BODY_OPEN_RE = re.compile(r"<body\b[^>]*>", re.IGNORECASE)
_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE)
_HEAD_RE = re.compile(r"<head\b[^>]*>.*?</head\s*>", re.DOTALL | re.IGNORECASE)
_HTML_TAG_RE = re.compile(r"</?html\b[^>]*>", re.IGNORECASE)

_PAGE_HEADER = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Consolidated Thesis Documents</title>
<style>
body { font-family: sans-serif; margin: 2em; }
.document { border-top: 4px double #444; margin-top: 3em; padding-top: 1em; }
.source-file { color: #666; font-style: italic; }
.error { color: #b00020; }
</style>
</head>
<body>
<h1>Consolidated Thesis Collection</h1>
"""

_PAGE_FOOTER = """</body>
</html>
"""


def _extract_body(content):
    """Return the embeddable part of an HTML document.

    When a ``<body>`` tag is present, the text between it and the last
    ``</body>`` (or the end of the document if it is never closed) is kept.
    Otherwise the ``<head>`` section is dropped and the rest is kept. In both
    cases the DOCTYPE and any ``<html>``/``</html>`` tags are removed so the
    fragment can be nested inside another page.
    """
    content = _DOCTYPE_RE.sub("", content)

    opening = _BODY_OPEN_RE.search(content)
    if opening:
        start = opening.end()
        closings = list(_BODY_CLOSE_RE.finditer(content, start))
        # Use the LAST closing tag so "</body>" inside scripts/text is kept.
        end = closings[-1].start() if closings else len(content)
        content = content[start:end]
    else:
        # No <body>: fall back to stripping the head and any stray </body>.
        content = _HEAD_RE.sub("", content)
        content = _BODY_CLOSE_RE.sub("", content)

    return _HTML_TAG_RE.sub("", content)


def consolidate_html_theses(
    parent_dir=DEFAULT_PARENT_DIR,
    html_files=None,
    output_file=DEFAULT_OUTPUT_FILE,
):
    """Merge several HTML documents into one page with a table of contents.

    Args:
        parent_dir: Directory that contains the source HTML files.
        html_files: Iterable of file names inside ``parent_dir``; defaults to
            ``DEFAULT_HTML_FILES``. Missing files are reported and skipped.
        output_file: Path of the consolidated HTML file to write.

    Each document keeps the number of its position in ``html_files`` (so a
    skipped file leaves a gap), and that number is shared by the table of
    contents entry and the section heading. A file that exists but cannot be
    read gets an error paragraph in place of its content.

    Raises:
        OSError: If ``output_file`` cannot be written.
    """
    if html_files is None:
        html_files = DEFAULT_HTML_FILES

    toc_entries = []
    sections = []

    for i, filename in enumerate(html_files, 1):
        filepath = os.path.join(parent_dir, filename)

        if not os.path.exists(filepath):
            print(f"File not found: {filename}")
            continue

        print(f"Processing {filename}...")

        # File names are plain text, so escape them before embedding in markup.
        doc_title = html.escape(os.path.splitext(filename)[0].replace("_", " "))
        safe_filename = html.escape(filename)

        try:
            with open(filepath, "r", encoding="utf-8") as f:
                body = _extract_body(f.read())
        except (OSError, UnicodeError) as e:
            print(f"Error processing {filename}: {str(e)}")
            body = (
                '<p class="error">Error reading this document: '
                f"{html.escape(str(e))}</p>"
            )

        toc_entries.append(
            f'<li><a href="#doc-{i}">{i}. {doc_title}</a></li>\n'
        )
        sections.append(
            f'<div class="document" id="doc-{i}">\n'
            f'<h2 class="document-title">{i}. {doc_title}</h2>\n'
            f'<p class="source-file">Source file: {safe_filename}</p>\n'
            '<div class="document-content">\n'
            f"{body}\n"
            "</div>\n"
            "</div>\n"
        )

    # Assemble once with join instead of growing one string with "+=".
    consolidated_html = "".join(
        [
            _PAGE_HEADER,
            '<div class="toc">\n<h2>Table of Contents</h2>\n<ul>\n',
            *toc_entries,
            "</ul>\n</div>\n",
            *sections,
            _PAGE_FOOTER,
        ]
    )

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(consolidated_html)

    print(f"Consolidated HTML thesis document created: {output_file}")
    print(f"Included {len(sections)} documents in the consolidated file")


if __name__ == "__main__":
    consolidate_html_theses()