ai7-m3/scheduler_bots/combine_thesis_html.py

#!/usr/bin/env python
"""
combine_thesis_html.py - Combines all HTML files in the Thesis materials directory
into the main thesis document
"""

import os
import re
from bs4 import BeautifulSoup


def combine_html_files(thesis_dir=None, main_file=None):
    """Append the body of every other HTML file in a directory to the main file.

    The main document is parsed, a separator and an "Additional Thesis
    Materials" heading are appended to its ``<body>``, and then one styled
    ``<div class="additional-section">`` per additional ``.html`` file is
    appended, containing the element children of that file's ``<body>``
    (top-level text nodes are intentionally not copied).

    Args:
        thesis_dir: Directory holding the HTML files. Defaults to the
            project's "Thesis materials" folder.
        main_file: File name, inside ``thesis_dir``, of the document that
            receives the appended content. Defaults to the thesis document.

    Note:
        The main file is overwritten in place with the prettified result, and
        each call appends a new block, so running the function twice
        duplicates the appended sections.
    """
    if thesis_dir is None:
        thesis_dir = "/Users/home/YandexDisk/TECHNOLYCEUM/ict/Year/2025/ai/ai7/ai7-m3/scheduler_bots/Thesis materials"
    if main_file is None:
        main_file = "Thesis_ Intelligent School Schedule Management System.html"
    main_file_path = os.path.join(thesis_dir, main_file)

    # os.listdir() returns names in arbitrary order; sort so the appended
    # sections come out in the same order on every run and platform.
    html_files = sorted(f for f in os.listdir(thesis_dir) if f.endswith('.html'))

    print(f"Found {len(html_files)} HTML files:")
    for i, f in enumerate(html_files, 1):
        print(f"  {i}. {f}")

    # Read and parse the main document
    with open(main_file_path, 'r', encoding='utf-8') as f:
        soup_main = BeautifulSoup(f.read(), 'html.parser')

    main_body = soup_main.find('body')
    if main_body is None:
        # No <body> in the main file: create one at the END of <html> (or of
        # the document when there is no <html>) so that the appended material
        # follows the existing content instead of preceding it.
        main_body = soup_main.new_tag('body')
        container = soup_main.html if soup_main.html is not None else soup_main
        container.append(main_body)

    # Visual separator between the original thesis and the appended material
    separator = soup_main.new_tag('hr')
    separator['style'] = 'margin: 40px 0; border: 2px solid #4a6fa5;'
    main_body.append(separator)

    # Heading that introduces the appended material
    appendix_heading = soup_main.new_tag('h2')
    appendix_heading.string = 'Additional Thesis Materials'
    appendix_heading['style'] = 'color: #2c3e50; margin-top: 40px; border-bottom: 2px solid #4a6fa5; padding-bottom: 10px;'
    main_body.append(appendix_heading)

    for filename in html_files:
        if filename == main_file:  # The main file is the target, not a source
            continue

        print(f"Processing {filename}...")

        with open(os.path.join(thesis_dir, filename), 'r', encoding='utf-8') as f:
            soup_additional = BeautifulSoup(f.read(), 'html.parser')

        # Wrapper that visually groups everything coming from this file
        section_div = soup_main.new_tag('div')
        section_div['class'] = 'additional-section'
        section_div['style'] = 'margin: 30px 0; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #fafafa;'

        section_heading = soup_main.new_tag('h3')
        section_heading.string = f'Content from: {filename}'
        section_heading['style'] = 'color: #4a6fa5; margin-top: 0;'
        section_div.append(section_heading)

        additional_body = soup_additional.find('body')
        if additional_body is not None:
            # Snapshot the children first: extract() removes the node from
            # additional_body's child list, and mutating that list while
            # iterating it makes the loop skip the node that follows each
            # extracted element.
            for child in list(additional_body.children):
                if child.name:  # Only copy actual elements, not text nodes
                    section_div.append(child.extract())
        else:
            # If no body tag, add the whole parsed content
            section_div.append(soup_additional)

        main_body.append(section_div)

    # Write the combined document back over the main file
    with open(main_file_path, 'w', encoding='utf-8') as f:
        f.write(soup_main.prettify())

    print(f"All HTML files have been combined into {main_file}")
    print(f"Combined file saved at: {main_file_path}")


if __name__ == "__main__":
    # Probe for BeautifulSoup on its own. Keeping combine_html_files() out of
    # the try body means an ImportError raised for an unrelated reason while
    # combining propagates as-is instead of silently triggering the fallback
    # (which would then rewrite the main file a second way).
    # NOTE(review): the module-level `from bs4 import BeautifulSoup` fails
    # before this point when bs4 is missing, so the fallback below is only
    # reachable if that import is made optional.
    try:
        import bs4  # noqa: F401 -- availability probe only
    except ImportError:
        bs4_available = False
    else:
        bs4_available = True

    if bs4_available:
        combine_html_files()
    else:
        from html import escape

        print("BeautifulSoup4 library is required for this script.")
        print("Install it with: pip install beautifulsoup4")

        # Create a simple version without BeautifulSoup
        print("Creating a basic combination without BeautifulSoup...")

        thesis_dir = "/Users/home/YandexDisk/TECHNOLYCEUM/ict/Year/2025/ai/ai7/ai7-m3/scheduler_bots/Thesis materials"
        main_file = "Thesis_ Intelligent School Schedule Management System.html"
        main_file_path = os.path.join(thesis_dir, main_file)

        # os.listdir() order is arbitrary; sort for a reproducible section order
        html_files = sorted(f for f in os.listdir(thesis_dir) if f.endswith('.html'))

        # Read the main file content
        with open(main_file_path, 'r', encoding='utf-8') as f:
            main_content = f.read()

        # Insert just before </body>; failing that, before </html>; failing
        # that, at the very end of the document.
        insert_pos = main_content.rfind('</body>')
        if insert_pos == -1:
            insert_pos = main_content.rfind('</html>')
        if insert_pos == -1:
            insert_pos = len(main_content)

        # Patterns that strip the document wrapper so only body content is
        # kept, applied in this order: doctype, <html> tags, the whole <head>
        # section, <body> tags. The \b keeps "<head" from also matching
        # "<header ...>" (and likewise for the html/body prefixes). Compiled
        # once, outside the per-file loop.
        wrapper_patterns = [
            re.compile(r'<!DOCTYPE[^>]*>', re.IGNORECASE),
            re.compile(r'<html\b[^>]*>|</html>', re.IGNORECASE),
            re.compile(r'<head\b[^>]*>.*?</head>', re.DOTALL | re.IGNORECASE),
            re.compile(r'<body\b[^>]*>|</body>', re.IGNORECASE),
        ]

        # Collect the pieces in a list and join once at the end
        parts = ['\n\n<!-- Additional Thesis Materials -->\n<hr style="margin: 40px 0; border: 2px solid #4a6fa5;">\n<h2 style="color: #2c3e50; margin-top: 40px; border-bottom: 2px solid #4a6fa5; padding-bottom: 10px;">Additional Thesis Materials</h2>\n\n']

        for filename in html_files:
            if filename == main_file:  # Skip the main file
                continue

            print(f"Processing {filename}...")

            with open(os.path.join(thesis_dir, filename), 'r', encoding='utf-8') as f:
                content = f.read()

            for pattern in wrapper_patterns:
                content = pattern.sub('', content)

            # Section wrapper; the file name is escaped because it is placed
            # into HTML text and may contain '&', '<' or '>'.
            parts.append('\n<div class="additional-section" style="margin: 30px 0; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #fafafa;">\n')
            parts.append(f'<h3 style="color: #4a6fa5; margin-top: 0;">Content from: {escape(filename, quote=False)}</h3>\n')
            parts.append(content)
            parts.append('\n</div>\n')

        additional_content = ''.join(parts)

        # Splice the additional content into the main document
        combined_content = main_content[:insert_pos] + additional_content + main_content[insert_pos:]

        # Write the combined content back to the main file
        with open(main_file_path, 'w', encoding='utf-8') as f:
            f.write(combined_content)

        print(f"All HTML files have been combined into {main_file}")
        print(f"Combined file saved at: {main_file_path}")