Source code for parallel_classification

import pandas as pd
from QhX.output_parallel import classify_periods, classify_period

def process_csv_in_chunks(csv_file_path, chunk_size=10000, output_file_path='classified_periods.csv'):
    """
    Processes a CSV file in chunks, classifying periods for each chunk.

    Reads the specified CSV file in chunks of a specified size, applies
    classification to each chunk using the `classify_periods` function,
    further classifies each period within the chunk using `classify_period`,
    and compiles the results into a single DataFrame. The final DataFrame
    is then saved to a new CSV file.

    Parameters
    ----------
    csv_file_path : str
        The path to the CSV file to be processed.
    chunk_size : int, optional
        The number of rows per chunk to read from the CSV. Defaults to 10000.
    output_file_path : str, optional
        Path where the fully processed and classified CSV file will be saved.
        Defaults to 'classified_periods.csv'.

    Returns
    -------
    None
        This function does not return a value. It saves the processed and
        classified data directly to a CSV file specified by `output_file_path`.

    Example
    -------
    Below is an example of how to use the `process_csv_in_chunks` function:

    >>> csv_file_path = 'path/to/your/large_csv_file.csv'
    >>> output_file_path = 'path/to/save/classified_periods.csv'
    >>> process_csv_in_chunks(csv_file_path, chunk_size=10000, output_file_path=output_file_path)
    Processed and classified data saved to path/to/save/classified_periods.csv
    """
    final_processed_df = pd.DataFrame()

    for chunk in pd.read_csv(csv_file_path, chunksize=chunk_size):
        classified_df = classify_periods(chunk.to_dict('records'))
        classified_df['classification'] = classified_df.apply(classify_period, axis=1)
        final_processed_df = pd.concat([final_processed_df, classified_df], ignore_index=True)

    final_processed_df.to_csv(output_file_path, index=False)
    print(f"Processed and classified data saved to {output_file_path}")
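
Note that the intermediate `final_processed_df` accumulates every classified chunk, so for very large inputs it can grow to the size of the full result. A minimal sketch of an alternative that appends each classified chunk straight to the output CSV instead of concatenating in memory; the function name `process_csv_streaming` is illustrative and not part of QhX, and the sketch assumes, as the code above does, that `classify_periods` returns a DataFrame:

import pandas as pd
from QhX.output_parallel import classify_periods, classify_period

def process_csv_streaming(csv_file_path, chunk_size=10000,
                          output_file_path='classified_periods.csv'):
    # Hypothetical variant: write each classified chunk to the output file as
    # soon as it is processed, so the full result is never held in memory.
    header_written = False
    for chunk in pd.read_csv(csv_file_path, chunksize=chunk_size):
        classified_df = classify_periods(chunk.to_dict('records'))
        classified_df['classification'] = classified_df.apply(classify_period, axis=1)
        # Write the header only for the first chunk, then append.
        classified_df.to_csv(output_file_path,
                             mode='w' if not header_written else 'a',
                             header=not header_written,
                             index=False)
        header_written = True
    print(f"Processed and classified data saved to {output_file_path}")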