#!/usr/bin/env python
"""
combine_thesis_html.py - Combines all HTML files in the Thesis materials directory
into the main thesis document
"""
|
|
|
|
import html
import os
import re

from bs4 import BeautifulSoup
|
|
|
|
|
|
def combine_html_files(
    thesis_dir="/Users/home/YandexDisk/TECHNOLYCEUM/ict/Year/2025/ai/ai7/ai7-m3/scheduler_bots/Thesis materials",
    main_file="Thesis_ Intelligent School Schedule Management System.html",
):
    """Append the content of every other HTML file in a directory to a main file.

    The main document is parsed with BeautifulSoup; a separator, an
    "Additional Thesis Materials" heading and one styled ``<div>`` per
    additional ``.html`` file are appended to its ``<body>``. The result is
    written back over the main file.

    Args:
        thesis_dir: Directory that holds the main file and the additional
            HTML files. Defaults to the original hard-coded location.
        main_file: File name (inside ``thesis_dir``) of the document that
            receives the combined content.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            read or written.

    Note:
        The main file is overwritten in place and nothing checks for
        previously appended sections, so running the function twice
        appends the materials twice.
    """
    main_file_path = os.path.join(thesis_dir, main_file)

    # os.listdir() returns entries in arbitrary order; sort so the appended
    # sections come out in the same order on every run.
    html_files = sorted(
        name for name in os.listdir(thesis_dir) if name.endswith('.html')
    )

    print(f"Found {len(html_files)} HTML files:")
    for i, name in enumerate(html_files, 1):
        print(f" {i}. {name}")

    # Read and parse the main document.
    with open(main_file_path, 'r', encoding='utf-8') as fh:
        soup_main = BeautifulSoup(fh.read(), 'html.parser')

    main_body = soup_main.find('body')
    if main_body is None:
        # No <body>: create one and put it AFTER the existing content
        # (after <head> inside <html>, or at the end of a bare fragment) so
        # the appended materials follow the main document instead of
        # preceding it.
        main_body = soup_main.new_tag('body')
        container = soup_main.html if soup_main.html is not None else soup_main
        container.append(main_body)

    # Visual separator before the appended content.
    separator = soup_main.new_tag('hr')
    separator['style'] = 'margin: 40px 0; border: 2px solid #4a6fa5;'
    main_body.append(separator)

    # Heading for the appended content.
    appendix_heading = soup_main.new_tag('h2')
    appendix_heading.string = 'Additional Thesis Materials'
    appendix_heading['style'] = 'color: #2c3e50; margin-top: 40px; border-bottom: 2px solid #4a6fa5; padding-bottom: 10px;'
    main_body.append(appendix_heading)

    for filename in html_files:
        if filename == main_file:  # Skip the main file itself
            continue

        print(f"Processing {filename}...")

        file_path = os.path.join(thesis_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as fh:
            soup_additional = BeautifulSoup(fh.read(), 'html.parser')

        # Wrapper <div> for this file's content.
        section_div = soup_main.new_tag('div')
        section_div['class'] = 'additional-section'
        section_div['style'] = 'margin: 30px 0; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #fafafa;'

        # Per-file heading; assigning .string lets BeautifulSoup escape the
        # file name on output.
        section_heading = soup_main.new_tag('h3')
        section_heading.string = f'Content from: {filename}'
        section_heading['style'] = 'color: #4a6fa5; margin-top: 0;'
        section_div.append(section_heading)

        additional_body = soup_additional.find('body')
        if additional_body is not None:
            # Snapshot the children first: extract() removes the node from
            # the very list that .children iterates, so looping over the
            # live iterator would skip every other element.
            for child in list(additional_body.children):
                if child.name:  # Only move elements; bare text nodes are skipped
                    section_div.append(child.extract())
        else:
            # No <body> tag: add the whole parsed document.
            section_div.append(soup_additional)

        main_body.append(section_div)

    # Write the combined document back over the main file.
    with open(main_file_path, 'w', encoding='utf-8') as fh:
        fh.write(soup_main.prettify())

    print(f"All HTML files have been combined into {main_file}")
    print(f"Combined file saved at: {main_file_path}")
|
|
|
|
|
|
def _combine_without_bs4(
    thesis_dir="/Users/home/YandexDisk/TECHNOLYCEUM/ict/Year/2025/ai/ai7/ai7-m3/scheduler_bots/Thesis materials",
    main_file="Thesis_ Intelligent School Schedule Management System.html",
):
    """Regex-based fallback that merges the HTML files without BeautifulSoup.

    Every other ``.html`` file in ``thesis_dir`` has its doctype, ``<html>``
    tags, ``<head>`` section and ``<body>`` tags stripped; what remains is
    wrapped in a styled ``<div>`` and inserted into the main file just before
    its last ``</body>`` (or last ``</html>``, or at the very end when neither
    tag is present). The main file is overwritten in place.

    Args:
        thesis_dir: Directory that holds the main file and the additional
            HTML files.
        main_file: File name (inside ``thesis_dir``) of the document that
            receives the combined content.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            read or written.
    """
    main_file_path = os.path.join(thesis_dir, main_file)

    # Sorted so the section order is reproducible; os.listdir() order is
    # arbitrary.
    html_files = sorted(
        name for name in os.listdir(thesis_dir) if name.endswith('.html')
    )

    with open(main_file_path, 'r', encoding='utf-8') as fh:
        main_content = fh.read()

    # Insertion point: before the last </body>, else before the last </html>,
    # else the end of the file. Matched case-insensitively, consistent with
    # the tag-stripping patterns below.
    insert_pos = len(main_content)
    for closing_tag in ('</body>', '</html>'):
        matches = list(
            re.finditer(closing_tag, main_content, flags=re.IGNORECASE)
        )
        if matches:
            insert_pos = matches[-1].start()
            break

    # Collect the pieces and join once instead of repeated string +=.
    parts = [
        '\n\n<!-- Additional Thesis Materials -->\n<hr style="margin: 40px 0; border: 2px solid #4a6fa5;">\n<h2 style="color: #2c3e50; margin-top: 40px; border-bottom: 2px solid #4a6fa5; padding-bottom: 10px;">Additional Thesis Materials</h2>\n\n'
    ]

    for filename in html_files:
        if filename == main_file:  # Skip the main file itself
            continue

        print(f"Processing {filename}...")

        file_path = os.path.join(thesis_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as fh:
            content = fh.read()

        # Strip the document wrapper so only body content remains.
        content = re.sub(r'<!DOCTYPE[^>]*>', '', content, flags=re.IGNORECASE)
        content = re.sub(r'<html[^>]*>|</html>', '', content, flags=re.IGNORECASE)
        content = re.sub(r'<head[^>]*>.*?</head>', '', content, flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r'<body[^>]*>|</body>', '', content, flags=re.IGNORECASE)

        parts.append('\n<div class="additional-section" style="margin: 30px 0; padding: 20px; border: 1px solid #ddd; border-radius: 8px; background-color: #fafafa;">\n')
        # Escape the file name: characters such as & or < would otherwise be
        # emitted as raw markup (the BeautifulSoup path escapes it too).
        parts.append(f'<h3 style="color: #4a6fa5; margin-top: 0;">Content from: {html.escape(filename)}</h3>\n')
        parts.append(content)
        parts.append('\n</div>\n')

    combined_content = (
        main_content[:insert_pos] + ''.join(parts) + main_content[insert_pos:]
    )

    # Write the combined content back over the main file.
    with open(main_file_path, 'w', encoding='utf-8') as fh:
        fh.write(combined_content)

    print(f"All HTML files have been combined into {main_file}")
    print(f"Combined file saved at: {main_file_path}")


if __name__ == "__main__":
    # NOTE(review): the module-level `from bs4 import BeautifulSoup` raises
    # ImportError before this guard is reached, so the fallback below only
    # becomes reachable if that import is made lazy/optional.
    try:
        # Availability probe only. Keep the try body minimal so that an
        # ImportError raised while combining is not mistaken for a missing
        # BeautifulSoup installation.
        import bs4  # noqa: F401
    except ImportError:
        print("BeautifulSoup4 library is required for this script.")
        print("Install it with: pip install beautifulsoup4")

        # Fall back to a simple version without BeautifulSoup.
        print("Creating a basic combination without BeautifulSoup...")
        _combine_without_bs4()
    else:
        combine_html_files()