202 lines
6.5 KiB
Python
202 lines
6.5 KiB
Python
#!/usr/bin/env python
|
|
"""
consolidate_theses.py - Consolidates all HTML thesis files into a single HTML file
with clear separation between the different documents.
"""
|
|
|
|
import os
|
|
import re
|
|
|
|
|
|
# Defaults used when consolidate_html_theses() is called without arguments.
_DEFAULT_PARENT_DIR = "/Users/home/YandexDisk/TECHNOLYCEUM/ict/Year/2025/ai/ai7/ai7-m3"
_DEFAULT_HTML_FILES = (
    "Lesson_ SQLite Database Implementation.html",
    "Presentaion_School Schedule Assistant Bot _ Student Project.html",
    "Professional_Thesis_Scheduler_Bot.html",
    "Scheduler Bot_ Telegram & CSV Database.html",
    "Student Database Search System _ Beginner's Guide.html",
    "Thesis_ Intelligent School Schedule Management System_23_Jan_2026.html",
    "Thesis_AI7_Building_A_ Scheduler_Bot_A Student Project.html",
)
_DEFAULT_OUTPUT_FILE = "consolidated_theses.html"

# Patterns for stripping the document wrapper.  They are case-insensitive
# because HTML tag names are, and use \b so that e.g. <header> is not
# mistaken for <head>.
_DOCTYPE_RE = re.compile(r'<!DOCTYPE[^>]*>', re.IGNORECASE)
_BODY_OPEN_RE = re.compile(r'<body\b[^>]*>', re.IGNORECASE)
_BODY_CLOSE_RE = re.compile(r'</body\s*>', re.IGNORECASE)
_HEAD_RE = re.compile(r'<head\b[^>]*>.*?</head\s*>', re.DOTALL | re.IGNORECASE)
_HTML_TAG_RE = re.compile(r'<html\b[^>]*>|</html\s*>', re.IGNORECASE)

_PAGE_HEAD = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Consolidated Thesis Documents</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f9f9f9;
            line-height: 1.6;
        }
        .document-separator {
            page-break-before: always;
            border-top: 3px solid #333;
            margin: 30px 0;
        }
        .document-header {
            background-color: #e9ecef;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 20px;
            border-left: 4px solid #007bff;
        }
        .document-title {
            color: #2c3e50;
            font-size: 24px;
            margin: 0;
        }
        .document-source {
            color: #6c757d;
            font-size: 14px;
            margin-top: 5px;
        }
        .document-content {
            background-color: white;
            padding: 20px;
            border-radius: 5px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
            margin-bottom: 30px;
        }
        .toc {
            background-color: #f8f9fa;
            padding: 20px;
            border-radius: 5px;
            margin-bottom: 30px;
            border-left: 4px solid #28a745;
        }
        .toc h2 {
            color: #2c3e50;
            margin-top: 0;
        }
        .toc ul {
            list-style-type: decimal;
            padding-left: 20px;
        }
        .toc li {
            margin-bottom: 8px;
        }
        .toc a {
            text-decoration: none;
            color: #007bff;
        }
        .toc a:hover {
            text-decoration: underline;
        }
        h1 {
            color: #343a40;
            border-bottom: 2px solid #007bff;
            padding-bottom: 10px;
        }
        .footer {
            text-align: center;
            margin-top: 30px;
            padding: 15px;
            color: #6c757d;
            font-size: 12px;
            border-top: 1px solid #dee2e6;
        }
    </style>
</head>
<body>
    <h1>Consolidated Thesis Collection</h1>
    <div class="toc">
        <h2>Table of Contents</h2>
        <ul>
"""

_PAGE_FOOT = """    <div class="footer">
        <p>Consolidated from multiple thesis documents | Generated automatically</p>
    </div>
</body>
</html>"""


def _extract_body_content(content):
    """Return the markup of *content* that can be embedded inside a <div>.

    The DOCTYPE is dropped first.  If an opening <body> tag exists, only the
    text between it and the last </body> (or the end of the text when the
    closing tag is missing) is kept.  Otherwise the first <head> section and
    any stray </body> are removed.  Remaining <html>/</html> tags are stripped
    in both cases.
    """
    content = _DOCTYPE_RE.sub('', content)
    opening = _BODY_OPEN_RE.search(content)
    if opening is not None:
        start = opening.end()
        closings = list(_BODY_CLOSE_RE.finditer(content, start))
        # Use the LAST closing tag so a literal "</body>" inside the document
        # text does not truncate the content.
        end = closings[-1].start() if closings else len(content)
        content = content[start:end]
    else:
        content = _HEAD_RE.sub('', content, count=1)
        content = _BODY_CLOSE_RE.sub('', content)
    return _HTML_TAG_RE.sub('', content)


def consolidate_html_theses(parent_dir=None, html_files=None, output_file=None):
    """Merge several HTML documents into one page with a table of contents.

    Each input file that exists gets a numbered section (anchor ``doc-N``)
    holding its body markup, preceded by a header showing a title derived
    from the file name.  Files that do not exist are reported on stdout and
    left out of both the table of contents and the body, so every TOC link
    has a target.

    Args:
        parent_dir: Directory containing the input files.  Defaults to the
            project directory this script was written for.
        html_files: Iterable of file names (relative to *parent_dir*) in the
            order they should appear.  Defaults to the built-in thesis list.
        output_file: Path of the HTML file to write.  Defaults to
            ``consolidated_theses.html`` in the current working directory.

    Returns:
        None.  The result is written to *output_file*; progress is printed.

    Raises:
        OSError: If *output_file* cannot be written.  Read or decode errors
            on an input file are not raised; they are reported inside that
            document's section instead.
    """
    # Local import keeps this block self-contained (the module only imports
    # os and re at the top).
    from html import escape

    if parent_dir is None:
        parent_dir = _DEFAULT_PARENT_DIR
    if html_files is None:
        html_files = _DEFAULT_HTML_FILES
    if output_file is None:
        output_file = _DEFAULT_OUTPUT_FILE

    # Decide up front which files will be included so that the TOC and the
    # sections share the same numbering.
    present = []
    for filename in html_files:
        filepath = os.path.join(parent_dir, filename)
        if os.path.isfile(filepath):
            present.append((filename, filepath))
        else:
            print(f"File not found: {filename}")

    parts = [_PAGE_HEAD]

    # Table of contents.  The list marker already supplies the number
    # (.toc ul uses list-style-type: decimal), so the link text is the bare
    # title.  File names are escaped because they may contain & or quotes.
    for index, (filename, _) in enumerate(present, 1):
        title = escape(os.path.splitext(filename)[0].replace('_', ' '))
        parts.append(
            f'            <li><a href="#doc-{index}">{title}</a></li>\n'
        )
    parts.append('        </ul>\n    </div>\n')

    # One section per document.
    for index, (filename, filepath) in enumerate(present, 1):
        print(f"Processing {filename}...")
        title = escape(os.path.splitext(filename)[0].replace('_', ' '))
        parts.append(
            f'    <div class="document-separator" id="doc-{index}"></div>\n'
            f'    <div class="document-header">\n'
            f'        <h2 class="document-title">{index}. {title}</h2>\n'
            f'        <div class="document-source">Source file: {escape(filename)}</div>\n'
            f'    </div>\n'
            f'    <div class="document-content">\n'
        )

        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                raw = f.read()
        except (OSError, UnicodeError) as exc:
            # Best effort: keep the section and show the failure in place.
            print(f"Error processing {filename}: {exc}")
            parts.append(
                f"<p><em>Error reading this document: {escape(str(exc))}</em></p>"
            )
        else:
            parts.append(_extract_body_content(raw))

        parts.append('    </div>\n')

    parts.append(_PAGE_FOOT)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    print(f"Consolidated HTML thesis document created: {output_file}")
    print(f"Included {len(present)} documents in the consolidated file")
|
|
|
|
|
|
# Run the consolidation with its default settings when executed as a script.
if __name__ == "__main__":
    consolidate_html_theses()