import os
import json
import logging
import requests
import pandas as pd
from io import BytesIO
import concurrent.futures
from pydub import AudioSegment

# --- Configuration ---------------------------------------------------------
# First API page to request; later pages are taken from each response.
API_URL = "http://102.53.12.137:3000/api/segment-tranning/"
# CSV file that receives one row per successfully processed audio.
CSV_FILE_NAME = "dataset.csv"
# Directory where the converted WAV files are written.
OUTPUT_FOLDER = "audios"

# Send INFO-and-above records, timestamped, to a log file.
logging.basicConfig(
    level=logging.INFO,
    filename='audio_downloader.log',
    format='%(asctime)s - %(levelname)s: %(message)s',
)

# Create the output directory up front; an existing one is left untouched.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Define a function to download and process audio
def process_audio(item, timeout=30):
    """Download one audio file, convert it to WAV and describe the result.

    Args:
        item: Mapping read with ``.get``. The keys used are ``"file_url"``
            (where the audio is downloaded from) and ``"ma_transcript"``
            (the transcript stored alongside the audio).
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block a worker thread indefinitely.

    Returns:
        A dict with ``"path"`` (the WAV file written under ``OUTPUT_FOLDER``),
        ``"transcript"`` and ``"duration"`` (in seconds), or ``None`` when the
        item is skipped or the download/conversion fails. Download and
        conversion errors are logged rather than raised.
    """
    file_url = item.get("file_url")

    # Validate the URL before touching os.path.basename: a missing URL would
    # otherwise raise TypeError outside the try block and abort the caller.
    if not file_url or not isinstance(file_url, str):
        logging.warning("Skipped audio: Missing or invalid file_url")
        return None

    file_name = os.path.basename(file_url)

    # Skip entries with an empty ma_transcript before spending any bandwidth.
    ma_transcript = item.get("ma_transcript")
    if not ma_transcript:
        logging.warning(f"Skipped audio ({file_name}): Empty ma_transcript")
        return None

    try:
        # Download the whole file into memory.
        response = requests.get(file_url, timeout=timeout)
        response.raise_for_status()
        audio_data = BytesIO(response.content)

        # The payload is always decoded as MP4, whatever the URL's extension.
        audio = AudioSegment.from_file(audio_data, format="mp4")
        audio_duration = len(audio) / 1000  # milliseconds to seconds

        # Same base name as the source file, with a .wav extension.
        wav_file_path = os.path.join(
            OUTPUT_FOLDER, os.path.splitext(file_name)[0] + ".wav"
        )
        audio.export(wav_file_path, format="wav")

        logging.info(f"Converted and processed: {file_name}")
        return {
            "path": wav_file_path,
            "transcript": ma_transcript,
            "duration": audio_duration,
        }

    except requests.RequestException as e:
        logging.error(f"Error downloading audio ({file_name}): {str(e)}")
    except Exception as e:
        # Best-effort: one bad file must not stop the rest of the batch.
        logging.error(f"Error processing audio ({file_name}): {str(e)}")

    return None

# Fetch data from the API
def fetch_data(api_url, timeout=30):
    """Fetch one page from the API and decode its body as JSON.

    Args:
        api_url: URL to request with HTTP GET.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot hang the script.

    Returns:
        The decoded JSON payload, or ``None`` if the request fails, the server
        answers with an error status, or the body is not valid JSON. Failures
        are logged rather than raised.
    """
    try:
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        return json.loads(response.text)
    except requests.RequestException as e:
        logging.error(f"Error fetching API data: {str(e)}")
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass; a non-JSON body
        # (e.g. an HTML error page) is reported instead of crashing.
        logging.error(f"Error decoding API data: {str(e)}")
    return None

# One dict (path / transcript / duration) per successfully processed audio.
audio_data_list = []

# Follow the pagination links with a local cursor so the API_URL constant is
# never overwritten, and reuse a single thread pool for every page.
next_url = API_URL
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    while next_url:
        api_data = fetch_data(next_url)
        if not api_data:
            # Fetch failed or returned nothing: stop, keep what we have.
            break

        # "results" may be absent or null; treat both as an empty page.
        audio_items = api_data.get("results") or []

        # Process the page concurrently, dropping entries that were skipped
        # or failed (process_audio returns None for those).
        audio_data_list.extend(
            entry
            for entry in executor.map(process_audio, audio_items)
            if entry is not None
        )

        # Advance to the next page; a missing/empty link ends the loop.
        next_url = api_data.get("next")

# Explicit columns keep the CSV header present even when nothing was
# processed, so the file can always be read back.
df = pd.DataFrame(audio_data_list, columns=["path", "transcript", "duration"])

# Save the DataFrame to a CSV file
df.to_csv(CSV_FILE_NAME, index=False)

print("Audio files downloaded and CSV created. Check 'audio_downloader.log' for details.")
