#!/usr/bin/env python
"""
consolidate_csv.py - Consolidates all CSV files in sample_data directory into a single CSV
with sheet identifiers to distinguish between different original files
"""

import csv
import os

def consolidate_csv_files():
    """Consolidate student-data CSV files from ``sample_data`` into one CSV.

    Every row of every input file is copied into ``consolidated_data.csv``,
    prefixed with two identifier columns: the 1-based sheet number and the
    original file name.  Files whose names contain ``first_sheet``,
    ``last_sheet`` or ``template`` are skipped.

    Returns:
        None.  Prints progress messages and writes ``consolidated_data.csv``
        in the current working directory.
    """
    sample_data_dir = "sample_data"
    output_file = "consolidated_data.csv"

    if not os.path.exists(sample_data_dir):
        print(f"Directory '{sample_data_dir}' not found.")
        return

    # Get all CSV files, then drop the schedule template and sheet files so
    # only the actual student distribution files remain.
    all_csv_files = [f for f in os.listdir(sample_data_dir) if f.endswith('.csv')]
    excluded = ('first_sheet', 'last_sheet', 'template')
    csv_files = [f for f in all_csv_files if not any(tag in f for tag in excluded)]

    if not csv_files:
        print(f"No student data CSV files found in '{sample_data_dir}' directory.")
        return

    print(f"Found {len(csv_files)} student data CSV file(s):")
    for i, filename in enumerate(csv_files, 1):
        # BUG FIX: this listing previously printed the literal "(unknown)"
        # instead of the file name being enumerated.
        print(f"  {i}. {filename}")

    consolidated_rows = []

    for sheet_num, filename in enumerate(csv_files, 1):
        csv_path = os.path.join(sample_data_dir, filename)
        print(f"Processing {csv_path}...")

        with open(csv_path, 'r', encoding='utf-8') as file:
            # Prefix each row with the sheet number and source file name so
            # the origin of every row survives consolidation.
            for row in csv.reader(file):
                consolidated_rows.append([sheet_num, filename] + row)

    # Write consolidated data to a new CSV file.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Header names the two identifier columns; the original data columns
        # follow and may vary in count between input files.
        writer.writerow(['Sheet_Number', 'Original_File', 'Data_Columns'])
        writer.writerows(consolidated_rows)

    print(f"Consolidated data written to {output_file}")
    print(f"Total rows in consolidated file: {len(consolidated_rows)}")
def consolidate_csv_files_simple():
    """Create a simpler consolidated CSV with Sheet_Number and Original_File columns.

    Streams every row of every student-data CSV in ``sample_data`` into
    ``consolidated_data_simple.csv``, prefixing each row with the 1-based
    sheet number and the original file name.  No header row is written.
    Files whose names contain ``first_sheet``, ``last_sheet`` or
    ``template`` are skipped.

    Returns:
        None.  Prints progress messages and writes the output file in the
        current working directory.
    """
    sample_data_dir = "sample_data"
    output_file = "consolidated_data_simple.csv"

    if not os.path.exists(sample_data_dir):
        print(f"Directory '{sample_data_dir}' not found.")
        return

    # Get all CSV files, keeping only the actual student distribution files
    # (not the sheets or the schedule template).
    all_csv_files = [f for f in os.listdir(sample_data_dir) if f.endswith('.csv')]
    excluded = ('first_sheet', 'last_sheet', 'template')
    csv_files = [f for f in all_csv_files if not any(tag in f for tag in excluded)]

    if not csv_files:
        print(f"No student data CSV files found in '{sample_data_dir}' directory.")
        return

    print(f"Found {len(csv_files)} student data CSV file(s):")
    for i, filename in enumerate(csv_files, 1):
        # BUG FIX: this listing previously printed the literal "(unknown)"
        # instead of the file name being enumerated.
        print(f"  {i}. {filename}")

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)

        # Process each file, streaming rows straight to the output writer.
        for sheet_num, filename in enumerate(csv_files, 1):
            csv_path = os.path.join(sample_data_dir, filename)
            print(f"Processing {csv_path}...")

            with open(csv_path, 'r', encoding='utf-8') as infile:
                # Add sheet number and filename as the first two columns.
                for row in csv.reader(infile):
                    writer.writerow([sheet_num, filename] + row)

    print(f"Simple consolidated data written to {output_file}")
def _main():
    """Script entry point: build the simple consolidated CSV for analysis."""
    print("Creating consolidated CSV with sheet identifiers...")
    consolidate_csv_files_simple()
    print("Done! You can now upload the consolidated_data_simple.csv file for AI/ML analysis.")


if __name__ == "__main__":
    _main()