#!/usr/bin/env python
"""
consolidate_csv.py - Consolidates all CSV files in sample_data directory into a single CSV
with sheet identifiers to distinguish between different original files
"""

import csv
import os

def consolidate_csv_files():
    """Consolidate student-data CSV files from ``sample_data`` into one CSV.

    Every row of every input file is copied into ``consolidated_data.csv``,
    prefixed with two identifier columns: the 1-based sheet number and the
    original file name.  Files whose names contain ``first_sheet``,
    ``last_sheet`` or ``template`` are skipped.

    Returns:
        None.  Prints progress messages and writes ``consolidated_data.csv``
        in the current working directory.
    """
    sample_data_dir = "sample_data"
    output_file = "consolidated_data.csv"

    if not os.path.exists(sample_data_dir):
        print(f"Directory '{sample_data_dir}' not found.")
        return

    # Get all CSV files, then drop the schedule template and sheet files so
    # only the actual student distribution files remain.
    all_csv_files = [f for f in os.listdir(sample_data_dir) if f.endswith('.csv')]
    excluded = ('first_sheet', 'last_sheet', 'template')
    csv_files = [f for f in all_csv_files if not any(tag in f for tag in excluded)]

    if not csv_files:
        print(f"No student data CSV files found in '{sample_data_dir}' directory.")
        return

    print(f"Found {len(csv_files)} student data CSV file(s):")
    for i, filename in enumerate(csv_files, 1):
        # BUG FIX: this listing previously printed the literal "(unknown)"
        # instead of the file name being enumerated.
        print(f"  {i}. {filename}")

    consolidated_rows = []

    for sheet_num, filename in enumerate(csv_files, 1):
        csv_path = os.path.join(sample_data_dir, filename)
        print(f"Processing {csv_path}...")

        with open(csv_path, 'r', encoding='utf-8') as file:
            # Prefix each row with the sheet number and source file name so
            # the origin of every row survives consolidation.
            for row in csv.reader(file):
                consolidated_rows.append([sheet_num, filename] + row)

    # Write consolidated data to a new CSV file.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Header names the two identifier columns; the original data columns
        # follow and may vary in count between input files.
        writer.writerow(['Sheet_Number', 'Original_File', 'Data_Columns'])
        writer.writerows(consolidated_rows)

    print(f"Consolidated data written to {output_file}")
    print(f"Total rows in consolidated file: {len(consolidated_rows)}")
def consolidate_csv_files_simple():
    """Create a simpler consolidated CSV with Sheet_Number and Original_File columns.

    Streams every row of every student-data CSV in ``sample_data`` into
    ``consolidated_data_simple.csv``, prefixing each row with the 1-based
    sheet number and the original file name.  No header row is written.
    Files whose names contain ``first_sheet``, ``last_sheet`` or
    ``template`` are skipped.

    Returns:
        None.  Prints progress messages and writes the output file in the
        current working directory.
    """
    sample_data_dir = "sample_data"
    output_file = "consolidated_data_simple.csv"

    if not os.path.exists(sample_data_dir):
        print(f"Directory '{sample_data_dir}' not found.")
        return

    # Get all CSV files, keeping only the actual student distribution files
    # (not the sheets or the schedule template).
    all_csv_files = [f for f in os.listdir(sample_data_dir) if f.endswith('.csv')]
    excluded = ('first_sheet', 'last_sheet', 'template')
    csv_files = [f for f in all_csv_files if not any(tag in f for tag in excluded)]

    if not csv_files:
        print(f"No student data CSV files found in '{sample_data_dir}' directory.")
        return

    print(f"Found {len(csv_files)} student data CSV file(s):")
    for i, filename in enumerate(csv_files, 1):
        # BUG FIX: this listing previously printed the literal "(unknown)"
        # instead of the file name being enumerated.
        print(f"  {i}. {filename}")

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)

        # Process each file, streaming rows straight to the output writer.
        for sheet_num, filename in enumerate(csv_files, 1):
            csv_path = os.path.join(sample_data_dir, filename)
            print(f"Processing {csv_path}...")

            with open(csv_path, 'r', encoding='utf-8') as infile:
                # Add sheet number and filename as the first two columns.
                for row in csv.reader(infile):
                    writer.writerow([sheet_num, filename] + row)

    print(f"Simple consolidated data written to {output_file}")
def _main():
    """Script entry point: build the simple consolidated CSV for analysis."""
    print("Creating consolidated CSV with sheet identifiers...")
    consolidate_csv_files_simple()
    print("Done! You can now upload the consolidated_data_simple.csv file for AI/ML analysis.")


if __name__ == "__main__":
    _main()