2026-02-05 10:15:09 +03:00
parent 2427fce842
commit 67241a5ed0
33 changed files with 13147 additions and 154 deletions

@@ -0,0 +1,117 @@
#!/usr/bin/env python
"""
consolidate_csv.py - Consolidates all CSV files in the sample_data directory into a
single CSV, adding sheet identifiers to distinguish the original files.
"""
import csv
import os


def consolidate_csv_files():
sample_data_dir = "sample_data"
output_file = "consolidated_data.csv"
if not os.path.exists(sample_data_dir):
print(f"Directory '{sample_data_dir}' not found.")
return
    # Keep only the actual student distribution files; skip the schedule
    # template and the per-sheet export files
    all_csv_files = [f for f in os.listdir(sample_data_dir) if f.endswith('.csv')]
    excluded_markers = ('first_sheet', 'last_sheet', 'template')
    csv_files = [f for f in all_csv_files
                 if not any(marker in f for marker in excluded_markers)]
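    # For instance (hypothetical names), "schedule_template.csv" and
    # "week1_first_sheet.csv" would be skipped, while
    # "student_distribution.csv" would be kept.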
if not csv_files:
print(f"No student data CSV files found in '{sample_data_dir}' directory.")
return
print(f"Found {len(csv_files)} student data CSV file(s):")
for i, filename in enumerate(csv_files, 1):
print(f" {i}. {filename}")
consolidated_rows = []
for sheet_num, filename in enumerate(csv_files, 1):
csv_path = os.path.join(sample_data_dir, filename)
print(f"Processing {csv_path}...")
with open(csv_path, 'r', encoding='utf-8') as file:
reader = csv.reader(file)
rows = list(reader)
            # Prefix each row with the sheet number and the source filename
            for row in rows:
                consolidated_rows.append([sheet_num, filename] + row)
    # Write the consolidated data to a new CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Header for the two identifier columns; the original columns vary
        # per source file, so they are left unlabeled here
        writer.writerow(['Sheet_Number', 'Original_File'])
        writer.writerows(consolidated_rows)
print(f"Consolidated data written to {output_file}")
print(f"Total rows in consolidated file: {len(consolidated_rows)}")


def consolidate_csv_files_simple():
    """
    Streams a simpler consolidated CSV: each row gets the sheet number and
    original filename as its first two columns; no header row is written.
    """
sample_data_dir = "sample_data"
output_file = "consolidated_data_simple.csv"
if not os.path.exists(sample_data_dir):
print(f"Directory '{sample_data_dir}' not found.")
return
    # Keep only the actual student distribution files, same filter as above
    all_csv_files = [f for f in os.listdir(sample_data_dir) if f.endswith('.csv')]
    excluded_markers = ('first_sheet', 'last_sheet', 'template')
    csv_files = [f for f in all_csv_files
                 if not any(marker in f for marker in excluded_markers)]
if not csv_files:
print(f"No student data CSV files found in '{sample_data_dir}' directory.")
return
print(f"Found {len(csv_files)} student data CSV file(s):")
for i, filename in enumerate(csv_files, 1):
print(f" {i}. {filename}")
with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
writer = csv.writer(outfile)
# Process each file
for sheet_num, filename in enumerate(csv_files, 1):
csv_path = os.path.join(sample_data_dir, filename)
print(f"Processing {csv_path}...")
with open(csv_path, 'r', encoding='utf-8') as infile:
reader = csv.reader(infile)
for row in reader:
# Add sheet number and filename as first two columns
new_row = [sheet_num, filename] + row
writer.writerow(new_row)
print(f"Simple consolidated data written to {output_file}")


if __name__ == "__main__":
print("Creating consolidated CSV with sheet identifiers...")
consolidate_csv_files_simple()
print("Done! You can now upload the consolidated_data_simple.csv file for AI/ML analysis.")