-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv_error_analyzer.py
53 lines (40 loc) · 1.97 KB
/
csv_error_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
from collections import defaultdict
# Prompt the user for the input file name
input_file = input("Enter the input CSV file name: ")
# Read the CSV file into a DataFrame
data = pd.read_csv(input_file)
# Initialize a defaultdict to store occurrences and a dictionary for aggregator totals
occurrences = defaultdict(lambda: defaultdict(int))
aggregator_totals = defaultdict(int)
# Loop through the DataFrame rows
for _, row in data.iterrows():
if pd.isna(row['message']):
print("Found a row with a missing 'message'. Skipping.")
continue
# Split the 'message' field by '|' delimiter
messages = row['message'].split('|')
# Check if there are at least 19 fields in the 'message'
if len(messages) >= 19:
num_2_content = messages[1] # Assuming this is your aggregator
num_19_content = messages[18] # Assuming this is where the error is
# Increment the count for the specific error for the aggregator
occurrences[num_2_content][num_19_content] += 1
# Increment the aggregator total
aggregator_totals[num_2_content] += 1
# Convert the defaultdict to a DataFrame
agg_df = pd.DataFrame.from_dict(occurrences, orient='index')
agg_df.reset_index(inplace=True)
agg_df.rename(columns={'index': 'Aggregator'}, inplace=True)
# Fill missing values with 0 and reset the index
agg_df.fillna(0, inplace=True)
agg_df.reset_index(drop=True, inplace=True)
# Add a "Total" column by merging with the aggregator totals
agg_df['Total'] = agg_df['Aggregator'].map(aggregator_totals)
# Calculate and add the Grand Total as a new row
grand_total = agg_df['Total'].sum()
agg_df.loc[len(agg_df)] = ['Grand Total'] + [agg_df[column].sum() for column in agg_df.columns[1:]]
# Prompt the user for the output file name
output_file = input("Enter the output CSV file name: ")
agg_df.to_csv(output_file, index=False)
print(f"Data has been processed and saved to '{output_file}'.")