import sys
import pandas as pd
from sklearn.model_selection import train_test_split


# Define the path for the log file
# log_file_path = "/var/www/html/Darija-Ai-Train/prepare_data.log"

# # Create a log file and redirect the standard output to it
# log_file = open(log_file_path, "w")
# sys.stdout = log_file

# Location of the source dataset CSV
base_dataset_path = "/var/www/html/Darija-Ai-Train/dataset.csv"

# Read the whole dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer=base_dataset_path)
print("Loaded the CSV file into a DataFrame.")

# Show the first and last rows as a quick sanity check of what was loaded
for preview in (df.head(), df.tail()):
    print(preview)

# Drop rows that cannot be used:
#   * any row with a missing value in any column, and
#   * any row whose 'transcript' is blank or is the literal text "None".
# The transcript is compared after trimming surrounding whitespace and
# lower-casing, so variants such as "   ", " none " or "NONE" are removed as
# well; an exact-match comparison would let those through.
df = df.dropna()
normalized_transcript = df["transcript"].astype(str).str.strip().str.lower()
df = df[~normalized_transcript.isin(["", "none"])]

# Renumber the surviving rows from 0 so the index has no gaps
df = df.reset_index(drop=True)
print(df)

# Number of seconds in one hour, used for the unit conversion below
SECONDS_PER_HOUR = 3600

# Add up the "duration" column (reported below as seconds) and print it
total_duration = df.loc[:, "duration"].sum()
print("Total Duration in Seconds:", total_duration, "sec")

# Express the same total in hours and print it
total_duration_hours = total_duration / SECONDS_PER_HOUR
print("Total Duration in Hours:", total_duration_hours, "hours")

# Two-stage split with a fixed seed so the partition is reproducible:
# first hold out roughly 10% of the rows, then divide that hold-out roughly
# 70/30 into the validation and test sets.
SPLIT_SEED = 42
train_data, temp_data = train_test_split(
    df, test_size=0.1, random_state=SPLIT_SEED
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.3, random_state=SPLIT_SEED
)

# Report the row count of the full dataset and of each split
split_report = (
    ("Number of Examples in Dataset Set:", df),
    ("Number of Examples in Training Set:", train_data),
    ("Number of Examples in Validation Set:", val_data),
    ("Number of Examples in Test Set:", test_data),
)
for label, frame in split_report:
    print(label, len(frame))

# Keep just the two columns that are written out for each split
output_columns = ["path", "transcript"]
train_data, val_data, test_data = (
    split[output_columns] for split in (train_data, val_data, test_data)
)

# Write each split to its own CSV in the current working directory,
# without the DataFrame index column
outputs = (
    (train_data, "train.csv"),
    (val_data, "dev.csv"),
    (test_data, "test.csv"),
)
for frame, filename in outputs:
    frame.to_csv(filename, index=False)

# print("Saved the modified DataFrames to CSV.")

# Close the log file to ensure it's saved
# log_file.close()