import os
import re
import time
import json
import logging
import subprocess
import requests
import pandas as pd
import urllib.parse
import platform
from collections import Counter
from datetime import datetime
from typing import List, Dict, Optional
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.safari.service import Service as SafariService
from selenium.webdriver.safari.options import Options as SafariOptions
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager
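
# Note: webdriver_manager downloads a matching driver binary on first use, so
# the initial run needs network access; binaries are cached (by default under
# ~/.wdm) for later runs.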


class WebSearchTool:
    """
    Cross-platform web search tool that supports multiple search engines
    and extracts comprehensive page information including links, titles, and metadata.
    """
    
    SEARCH_ENGINES = {
        'google': 'https://www.google.com/search?q={}&num={}',
        'bing': 'https://www.bing.com/search?q={}',
        'yandex': 'https://yandex.com/search/?text={}',
        'duckduckgo': 'https://duckduckgo.com/?q={}',
        'yahoo': 'https://search.yahoo.com/search?p={}',
        'baidu': 'https://www.baidu.com/s?wd={}'
    }
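    # Only the Google template takes a second placeholder (the result count);
    # str.format() ignores surplus positional arguments, so passing max_results
    # to the single-placeholder templates is harmless.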
    
    def __init__(self, 
                search_engine: str = 'google',
                headless: bool = True, 
                max_results: int = 10, 
                delay: int = 2,
                timeout: int = 30,
                output_dir: str = "search_results"
        ):
        """
        Initialize the WebSearchTool with configurable parameters.
        
        Args:
            search_engine (str): Search engine to use ('google', 'bing', 'yandex', etc.)
            headless (bool): Run browser in headless mode
            max_results (int): Maximum number of results to extract per query
            delay (int): Delay between requests in seconds
            timeout (int): Timeout for page loads in seconds
            output_dir (str): Directory to save results
        """
        self.search_engine = search_engine.lower()
        self.max_results = max_results
        self.delay = delay
        self.timeout = timeout
        self.output_dir = output_dir
        
        # Validate search engine
        if self.search_engine not in self.SEARCH_ENGINES:
            raise ValueError(f"Unsupported search engine: {search_engine}. "
                           f"Supported engines: {list(self.SEARCH_ENGINES.keys())}")
        
        # Create the output directory before logging writes into it
        os.makedirs(self.output_dir, exist_ok=True)
        
        self.setup_logging()
        self.system_info = self.get_system_info()
        self.available_browsers = self.detect_browsers()
        self.setup_driver(headless)
        
    def setup_logging(self):
        """Configure logging for the search tool."""
        log_file = os.path.join(self.output_dir, 'search_tool.log')
        
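        # Note: logging.basicConfig is a no-op when the root logger already has
        # handlers, so a second WebSearchTool in the same process keeps the first
        # configuration (pass force=True on Python 3.8+ to reconfigure).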
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)
        
    def get_system_info(self) -> Dict[str, str]:
        """Get system information for cross-platform compatibility."""
        return {
            'system': platform.system(),
            'release': platform.release(),
            'version': platform.version(),
            'machine': platform.machine(),
            'processor': platform.processor(),
            'python_version': platform.python_version()
        }
        
    def detect_browsers(self) -> Dict[str, str]:
        """
        Detect installed browsers across different operating systems.
        
        Returns:
            Dict[str, str]: Dictionary of detected browsers and their versions
        """
        browsers = {}
        system = self.system_info['system']
        
        # Common browser detection methods
        browser_commands = {
            'chrome': {
                'Windows': ['reg', 'query', 'HKEY_CURRENT_USER\\Software\\Google\\Chrome\\BLBeacon', '/v', 'version'],
                'Darwin': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', '--version'],
                'Linux': ['google-chrome', '--version']
            },
            'firefox': {
                'Windows': ['reg', 'query', 'HKEY_LOCAL_MACHINE\\SOFTWARE\\Mozilla\\Mozilla Firefox', '/v', 'CurrentVersion'],
                'Darwin': ['/Applications/Firefox.app/Contents/MacOS/firefox', '--version'],
                'Linux': ['firefox', '--version']
            },
            'edge': {
                'Windows': ['reg', 'query', 'HKEY_CURRENT_USER\\Software\\Microsoft\\Edge\\BLBeacon', '/v', 'version'],
                'Darwin': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge', '--version'],
                'Linux': ['microsoft-edge', '--version']
            },
            'safari': {
                # Safari ships no version CLI; it is picked up by the
                # executable-path check below instead
            }
        }
        
        # Alternative detection methods
        alt_commands = {
            'chrome': ['chromium-browser', '--version'],
            'firefox': ['firefox-esr', '--version'],
        }
        
        for browser, os_commands in browser_commands.items():
            if system in os_commands:
                try:
                    result = subprocess.run(
                        os_commands[system],
                        capture_output=True,
                        text=True,
                        timeout=10
                    )
                    
                    if result.returncode == 0:
                        if system == 'Windows' and 'REG_SZ' in result.stdout:
                            # Registry value names differ per browser ('version' vs
                            # 'CurrentVersion'), so match case-insensitively
                            version = re.search(r"version\s+REG_SZ\s+([\d.]+)", result.stdout, re.IGNORECASE)
                            if version:
                                browsers[browser] = version.group(1)
                        else:
                            # Parse version from output
                            version_match = re.search(r'(\d+\.\d+\.\d+)', result.stdout)
                            if version_match:
                                browsers[browser] = version_match.group(1)
                        
                        self.logger.info(f"Detected {browser} version: {browsers.get(browser, 'unknown')}")
                        
                except Exception as e:
                    self.logger.debug(f"Could not detect {browser} via primary method: {e}")
                    
                    # Try alternative commands
                    if browser in alt_commands:
                        try:
                            result = subprocess.run(
                                alt_commands[browser],
                                capture_output=True,
                                text=True,
                                timeout=10
                            )
                            if result.returncode == 0:
                                version_match = re.search(r'(\d+\.\d+\.\d+)', result.stdout)
                                if version_match:
                                    browsers[browser] = version_match.group(1)
                                    self.logger.info(f"Detected {browser} version: {browsers[browser]}")
                        except Exception as e2:
                            self.logger.debug(f"Alternative detection for {browser} failed: {e2}")
            
            # Check if browser executable exists
            if browser not in browsers:
                executable_paths = {
                    'chrome': {
                        'Windows': ['C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
                                  'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe'],
                        'Darwin': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
                        'Linux': ['/usr/bin/google-chrome', '/usr/bin/chromium-browser']
                    },
                    'firefox': {
                        'Windows': ['C:\\Program Files\\Mozilla Firefox\\firefox.exe',
                                  'C:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe'],
                        'Darwin': ['/Applications/Firefox.app/Contents/MacOS/firefox'],
                        'Linux': ['/usr/bin/firefox', '/usr/bin/firefox-esr']
                    },
                    'edge': {
                        'Windows': ['C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
                                  'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe'],
                        'Darwin': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
                        'Linux': ['/usr/bin/microsoft-edge']
                    },
                    'safari': {
                        'Darwin': ['/Applications/Safari.app/Contents/MacOS/Safari']
                    }
                }
                
                if browser in executable_paths and system in executable_paths[browser]:
                    for path in executable_paths[browser][system]:
                        if os.path.exists(path):
                            browsers[browser] = "detected"
                            self.logger.info(f"Detected {browser} at: {path}")
                            break
        
        if not browsers:
            self.logger.warning("No browsers detected on the system")
        else:
            self.logger.info(f"Detected browsers: {list(browsers.keys())}")
        
        return browsers
        
    def setup_driver(self, headless: bool):
        """Setup WebDriver with cross-platform compatibility."""
        if not self.available_browsers:
            raise RuntimeError("No supported browsers detected on the system")
            
        # Try browsers in order of preference
        browser_order = ["chrome", "firefox", "edge", "safari"]
        exceptions = {}
        
        for browser in browser_order:
            if browser in self.available_browsers:
                try:
                    if browser == "chrome":
                        self.setup_chrome_driver(headless)
                    elif browser == "firefox":
                        self.setup_firefox_driver(headless)
                    elif browser == "edge":
                        self.setup_edge_driver(headless)
                    elif browser == "safari":
                        self.setup_safari_driver(headless)
                    
                    self.logger.info(f"Successfully initialized {browser} WebDriver")
                    return
                    
                except Exception as e:
                    exceptions[browser] = str(e)
                    self.logger.warning(f"Failed to initialize {browser} WebDriver: {e}")
                    continue
        
        # If we get here, all browser attempts failed
        error_msg = "Failed to initialize any WebDriver:\n" + "\n".join(
            f"{b}: {e}" for b, e in exceptions.items()
        )
        self.logger.error(error_msg)
        raise RuntimeError(error_msg)
        
    def setup_chrome_driver(self, headless: bool):
        """Setup Chrome WebDriver with cross-platform options."""
        options = ChromeOptions()
        if headless:
            options.add_argument("--headless=new")
        
        # Cross-platform Chrome options
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1920,1080")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        
        # User agent
        options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
        
        service = ChromeService(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)
        # Register the override before any document loads so it persists across
        # navigations (execute_script would only affect the current page)
        self.driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"}
        )
        
    def setup_firefox_driver(self, headless: bool):
        """Setup Firefox WebDriver with cross-platform options."""
        options = FirefoxOptions()
        if headless:
            options.add_argument("--headless")
        
        options.add_argument("--width=1920")
        options.add_argument("--height=1080")
        
        # Firefox preferences
        options.set_preference("general.useragent.override", 
                             "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0")
        
        service = FirefoxService(GeckoDriverManager().install())
        self.driver = webdriver.Firefox(service=service, options=options)
        
    def setup_edge_driver(self, headless: bool):
        """Setup Edge WebDriver with cross-platform options."""
        options = EdgeOptions()
        if headless:
            options.add_argument("--headless")
        
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1920,1080")
        
        service = EdgeService(EdgeChromiumDriverManager().install())
        self.driver = webdriver.Edge(service=service, options=options)
        
    def setup_safari_driver(self, headless: bool):
        """Setup Safari WebDriver (macOS only)."""
        if self.system_info['system'] != 'Darwin':
            raise RuntimeError("Safari WebDriver is only available on macOS")
        
        options = SafariOptions()
        # Note: Safari doesn't support headless mode
        if headless:
            self.logger.warning("Safari doesn't support headless mode, running in normal mode")
        
        service = SafariService()
        self.driver = webdriver.Safari(service=service, options=options)
        
    def get_search_url(self, query: str, max_results: int = 25) -> str:
        """
        Generate a search URL for the configured search engine.
        
        Args:
            query (str): Search query
            max_results (int): Requested result count (only used by templates
                with a second placeholder, e.g. Google)
            
        Returns:
            str: Formatted search URL
        """
        encoded_query = urllib.parse.quote(query)
        return self.SEARCH_ENGINES[self.search_engine].format(encoded_query, max_results)
        
    def extract_page_metadata(self, url: str) -> Dict:
        """
        Extract comprehensive metadata from a webpage.
        
        Args:
            url (str): URL of the webpage
            
        Returns:
            Dict: Dictionary containing page metadata
        """
        metadata = {
            'url': url,
            'title': None,
            'description': None,
            'keywords': None,
            'author': None,
            'publish_date': None,
            'last_modified': None,
            'canonical_url': None,
            'language': None,
            'page_size': None,
            'load_time': None,
            'status_code': None,
            'content_type': None,
            'word_count': None,
            'images_count': None,
            'links_count': None,
            'external_links_count': None,
            'social_media_links': [],
            'email_addresses': [],
            'phone_numbers': [],
            'error': None
        }
        
        start_time = time.time()
        
        try:
            # Navigate to the page
            self.driver.get(url)
            WebDriverWait(self.driver, self.timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            
            metadata['load_time'] = time.time() - start_time
            
            # Get page source and create soup
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            # Basic page information
            metadata['title'] = self.driver.title
            metadata['page_size'] = len(page_source)
            
            # Meta tags
            meta_description = soup.find('meta', attrs={'name': 'description'})
            if meta_description:
                metadata['description'] = meta_description.get('content', '')
                
            meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
            if meta_keywords:
                metadata['keywords'] = meta_keywords.get('content', '')
                
            meta_author = soup.find('meta', attrs={'name': 'author'})
            if meta_author:
                metadata['author'] = meta_author.get('content', '')
                
            # Open Graph and other metadata
            og_title = soup.find('meta', property='og:title')
            if og_title and not metadata['title']:
                metadata['title'] = og_title.get('content', '')
                
            og_description = soup.find('meta', property='og:description')
            if og_description and not metadata['description']:
                metadata['description'] = og_description.get('content', '')
                
            # Canonical URL
            canonical = soup.find('link', rel='canonical')
            if canonical:
                metadata['canonical_url'] = canonical.get('href', '')
                
            # Language
            html_tag = soup.find('html')
            if html_tag:
                metadata['language'] = html_tag.get('lang', '')
                
            # Dates
            date_selectors = [
                'meta[name="article:published_time"]',
                'meta[property="article:published_time"]',
                'meta[name="pubdate"]',
                'meta[name="date"]',
                'time[datetime]'
            ]
            
            for selector in date_selectors:
                date_elem = soup.select_one(selector)
                if date_elem:
                    date_value = date_elem.get('content') or date_elem.get('datetime')
                    if date_value:
                        metadata['publish_date'] = date_value
                        break
            
            # Content analysis
            text_content = soup.get_text()
            words = text_content.split()
            metadata['word_count'] = len(words)
            
            # Count elements
            images = soup.find_all('img')
            metadata['images_count'] = len(images)
            
            links = soup.find_all('a', href=True)
            metadata['links_count'] = len(links)
            
            # External links (host differs from the page's own host; a substring
            # check on the raw href misfires on URLs that merely mention the domain)
            domain = urllib.parse.urlparse(url).netloc
            external_links = [
                link for link in links
                if link.get('href', '').startswith('http')
                and urllib.parse.urlparse(link.get('href', '')).netloc != domain
            ]
            metadata['external_links_count'] = len(external_links)
            
            # Social media links
            social_domains = ['facebook.com', 'twitter.com', 'instagram.com', 'linkedin.com', 
                            'youtube.com', 'tiktok.com', 'pinterest.com', 'snapchat.com']
            
            for link in links:
                href = link.get('href', '')
                for social in social_domains:
                    if social in href:
                        metadata['social_media_links'].append(href)
                        break
            
            # Extract email addresses and phone numbers. The TLD class must not
            # contain a literal '|', and the phone prefix group must be
            # non-capturing, or re.findall would return only the prefixes.
            # The phone pattern is North-America-centric.
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            phone_pattern = r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
            
            emails = re.findall(email_pattern, text_content)
            phones = re.findall(phone_pattern, text_content)
            
            metadata['email_addresses'] = list(set(emails))
            metadata['phone_numbers'] = list(set(phones))
            
            # HTTP response information (if available)
            try:
                response = requests.head(url, timeout=10, allow_redirects=True)
                metadata['status_code'] = response.status_code
                metadata['content_type'] = response.headers.get('content-type', '')
                metadata['last_modified'] = response.headers.get('last-modified', '')
            except requests.RequestException:
                pass
                
        except Exception as e:
            metadata['error'] = str(e)
            self.logger.error(f"Error extracting metadata from {url}: {e}")
            
        return metadata
        
    def search_and_extract(self, query: str) -> Dict:
        """
        Perform search and extract comprehensive information from results.
        
        Args:
            query (str): Search query
            
        Returns:
            Dict: Dictionary containing search results and metadata
        """
        search_results = {
            'query': query,
            'search_engine': self.search_engine,
            'timestamp': datetime.now().isoformat(),
            'system_info': self.system_info,
            'total_results': 0,
            'results': [],
            'search_metadata': {},
            'error': None
        }
        
        try:
            # Get search URL and perform search
            search_url = self.get_search_url(query, max_results=self.max_results)
            self.logger.info(f"Searching on {self.search_engine}: {query}")
            self.logger.info(f"Search URL: {search_url}")
            
            self.driver.get(search_url)
            time.sleep(self.delay)
            
            # Extract search results based on search engine
            if self.search_engine == 'google':
                results = self.extract_google_results()
            elif self.search_engine == 'bing':
                results = self.extract_bing_results()
            elif self.search_engine == 'yandex':
                results = self.extract_yandex_results()
            elif self.search_engine == 'duckduckgo':
                results = self.extract_duckduckgo_results()
            else:
                results = self.extract_generic_results()
            
            search_results['total_results'] = len(results)
            
            # Process each result
            results_to_process = results[:self.max_results]
            for i, result in enumerate(results_to_process):
                self.logger.info(f"Processing result {i+1}/{len(results_to_process)}: {result.get('url', 'N/A')}")
                
                # Extract comprehensive metadata
                if result.get('url'):
                    metadata = self.extract_page_metadata(result['url'])
                    result.update(metadata)
                
                search_results['results'].append(result)
                
                # Add delay between requests (skip after the final result)
                if i < len(results_to_process) - 1:
                    time.sleep(self.delay)
                    
        except Exception as e:
            search_results['error'] = str(e)
            self.logger.error(f"Error during search: {e}")
            
        return search_results
        
    def extract_google_results(self) -> List[Dict]:
        """Extract search results from Google."""
        results = []
        
        # Google search result selectors
        result_selectors = [
            'div.g',  # Main result container
            'div[data-sokoban-container] div.g',  # Alternative container
            'div.tF2Cxc'  # Another result container
        ]
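        # Google's result markup changes frequently; these class names are a
        # best-effort snapshot and will need updating when extraction returns nothing.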
        
        result_elements = []
        for selector in result_selectors:
            elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
            if elements:
                result_elements = elements
                break
        
        for element in result_elements:
            try:
                result = {}
                
                # Extract title and URL (find_element raises rather than
                # returning None, so no truthiness check is needed)
                title_link = element.find_element(By.CSS_SELECTOR, 'h3')
                result['title'] = title_link.text
                link_element = title_link.find_element(By.XPATH, './ancestor::a[1]')
                result['url'] = link_element.get_attribute('href')
                
                # Extract snippet
                snippet_selectors = ['.VwiC3b', '.s3v9rd', '.st']
                for selector in snippet_selectors:
                    try:
                        snippet = element.find_element(By.CSS_SELECTOR, selector)
                        result['snippet'] = snippet.text
                        break
                    except NoSuchElementException:
                        continue
                
                # Extract displayed URL
                try:
                    cite_element = element.find_element(By.CSS_SELECTOR, 'cite')
                    result['displayed_url'] = cite_element.text
                except NoSuchElementException:
                    pass
                
                if result.get('url'):
                    results.append(result)
                    
            except Exception as e:
                self.logger.debug(f"Error extracting Google result: {e}")
                continue
                
        return results
        
    def extract_bing_results(self) -> List[Dict]:
        """Extract search results from Bing."""
        results = []
        
        result_elements = self.driver.find_elements(By.CSS_SELECTOR, '.b_algo')
        
        for element in result_elements:
            try:
                result = {}
                
                # Extract title and URL
                title_link = element.find_element(By.CSS_SELECTOR, 'h2 a')
                result['title'] = title_link.text
                result['url'] = title_link.get_attribute('href')
                
                # Extract snippet
                try:
                    snippet = element.find_element(By.CSS_SELECTOR, '.b_caption p')
                    result['snippet'] = snippet.text
                except NoSuchElementException:
                    pass
                
                # Extract displayed URL
                try:
                    cite_element = element.find_element(By.CSS_SELECTOR, '.b_attribution cite')
                    result['displayed_url'] = cite_element.text
                except NoSuchElementException:
                    pass
                
                results.append(result)
                
            except Exception as e:
                self.logger.debug(f"Error extracting Bing result: {e}")
                continue
                
        return results
        
    def extract_yandex_results(self) -> List[Dict]:
        """Extract search results from Yandex."""
        results = []
        
        result_elements = self.driver.find_elements(By.CSS_SELECTOR, '.serp-item')
        
        for element in result_elements:
            try:
                result = {}
                
                # Extract title and URL
                title_link = element.find_element(By.CSS_SELECTOR, '.organic__title-wrapper a')
                result['title'] = title_link.text
                result['url'] = title_link.get_attribute('href')
                
                # Extract snippet
                try:
                    snippet = element.find_element(By.CSS_SELECTOR, '.organic__text')
                    result['snippet'] = snippet.text
                except NoSuchElementException:
                    pass
                
                results.append(result)
                
            except Exception as e:
                self.logger.debug(f"Error extracting Yandex result: {e}")
                continue
                
        return results
        
    def extract_duckduckgo_results(self) -> List[Dict]:
        """Extract search results from DuckDuckGo."""
        results = []
        
        result_elements = self.driver.find_elements(By.CSS_SELECTOR, '.result')
        
        for element in result_elements:
            try:
                result = {}
                
                # Extract title and URL
                title_link = element.find_element(By.CSS_SELECTOR, '.result__title a')
                result['title'] = title_link.text
                result['url'] = title_link.get_attribute('href')
                
                # Extract snippet
                try:
                    snippet = element.find_element(By.CSS_SELECTOR, '.result__snippet')
                    result['snippet'] = snippet.text
                except NoSuchElementException:
                    pass
                
                results.append(result)
                
            except Exception as e:
                self.logger.debug(f"Error extracting DuckDuckGo result: {e}")
                continue
                
        return results
        
    def extract_generic_results(self) -> List[Dict]:
        """Generic result extraction for other search engines."""
        results = []
        
        # Try to find common link patterns
        link_elements = self.driver.find_elements(By.CSS_SELECTOR, 'a[href]')
        
        for link in link_elements:
            try:
                href = link.get_attribute('href')
                text = link.text.strip()
                
                # Filter out non-result links
                if (href and text and 
                    href.startswith('http') and 
                    len(text) > 10 and 
                    'search' not in href.lower()):
                    
                    results.append({
                        'title': text,
                        'url': href,
                        'snippet': ''
                    })
                    
            except Exception:
                continue
                
        return results[:self.max_results]
        
    def save_results(self, results: Dict, filename: Optional[str] = None, output_format: str = 'json'):
        """
        Save search results to file.
        
        Args:
            results (Dict): Search results dictionary
            filename (str): Optional filename (auto-generated if None)
            output_format (str): Output format ('json', 'csv', 'both')
        
        Returns:
            str or tuple: Path(s) to saved file(s), or None on error
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            query_clean = re.sub(r'[^\w\s-]', '', results['query']).strip()
            query_clean = re.sub(r'[-\s]+', '_', query_clean)
            base_filename = f"search_{self.search_engine}_{query_clean}_{timestamp}"
        else:
            base_filename = os.path.splitext(filename)[0]
        
        saved_files = []
        
        try:
            if output_format in ['json', 'both']:
                json_filepath = os.path.join(self.output_dir, f"{base_filename}.json")
                with open(json_filepath, 'w', encoding='utf-8') as f:
                    json.dump(results, f, indent=2, ensure_ascii=False)
                
                self.logger.info(f"JSON results saved to: {json_filepath}")
                saved_files.append(json_filepath)
            
            if output_format in ['csv', 'both']:
                csv_filepath = self.export_to_csv_from_dict(results, f"{base_filename}.csv")
                if csv_filepath:
                    saved_files.append(csv_filepath)
            
            return saved_files[0] if len(saved_files) == 1 else tuple(saved_files)
            
        except Exception as e:
            self.logger.error(f"Error saving results: {e}")
            return None
            
    def export_to_csv_from_dict(self, results_dict: Dict, filename: Optional[str] = None) -> Optional[str]:
        """
        Export results dictionary directly to CSV format.
        
        Args:
            results_dict (Dict): Results dictionary
            filename (str): Optional filename
            
        Returns:
            str: Path to CSV file
        """
        try:
            if filename is None:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                query_clean = re.sub(r'[^\w\s-]', '', results_dict['query']).strip()
                query_clean = re.sub(r'[-\s]+', '_', query_clean)
                filename = f"search_{self.search_engine}_{query_clean}_{timestamp}.csv"
            
            # Flatten results for CSV
            csv_data = []
            for i, result in enumerate(results_dict.get('results', [])):
                row = {
                    'query': results_dict.get('query', ''),
                    'search_engine': results_dict.get('search_engine', ''),
                    'result_index': i + 1,
                    'title': result.get('title', ''),
                    'url': result.get('url', ''),
                    'snippet': result.get('snippet', ''),
                    'displayed_url': result.get('displayed_url', ''),
                    'description': result.get('description', ''),
                    'keywords': result.get('keywords', ''),
                    'author': result.get('author', ''),
                    'publish_date': result.get('publish_date', ''),
                    'language': result.get('language', ''),
                    'word_count': result.get('word_count', ''),
                    'images_count': result.get('images_count', ''),
                    'links_count': result.get('links_count', ''),
                    'external_links_count': result.get('external_links_count', ''),
                    'status_code': result.get('status_code', ''),
                    'content_type': result.get('content_type', ''),
                    'load_time': result.get('load_time', ''),
                    'page_size': result.get('page_size', ''),
                    'social_media_links': '; '.join(result.get('social_media_links', [])),
                    'email_addresses': '; '.join(result.get('email_addresses', [])),
                    'phone_numbers': '; '.join(result.get('phone_numbers', [])),
                    'error': result.get('error', '')
                }
                csv_data.append(row)
            
            # Create CSV file
            csv_file_path = os.path.join(self.output_dir, filename)
            df = pd.DataFrame(csv_data)
            df.to_csv(csv_file_path, index=False, encoding='utf-8')
            
            self.logger.info(f"CSV export saved to: {csv_file_path}")
            return csv_file_path
            
        except Exception as e:
            self.logger.error(f"Error exporting to CSV: {e}")
            return None
        
    def export_to_csv(self, json_file_path: str) -> Optional[str]:
        """
        Export JSON results to CSV format.
        
        Args:
            json_file_path (str): Path to JSON results file
            
        Returns:
            str: Path to CSV file
        """
        try:
            with open(json_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            csv_file_path = os.path.splitext(json_file_path)[0] + '.csv'
            return self.export_to_csv_from_dict(data, os.path.basename(csv_file_path))
            
        except Exception as e:
            self.logger.error(f"Error exporting to CSV: {e}")
            return None
            
    def export_to_json(self, csv_file_path: str, output_filename: Optional[str] = None) -> Optional[str]:
        """
        Export CSV results to JSON format.
        
        Args:
            csv_file_path (str): Path to CSV results file
            output_filename (str): Optional output filename
            
        Returns:
            str: Path to JSON file
        """
        try:
            # Read CSV file
            df = pd.read_csv(csv_file_path)
            
            if output_filename is None:
                # Use the basename only: the result is joined onto output_dir below
                output_filename = os.path.splitext(os.path.basename(csv_file_path))[0] + '_exported.json'
            
            # Group by query to reconstruct the original structure
            json_data = {}
            
            for query, group in df.groupby('query'):
                query_data = {
                    'query': query,
                    'search_engine': group.iloc[0]['search_engine'] if 'search_engine' in group.columns else 'unknown',
                    'timestamp': datetime.now().isoformat(),
                    'total_results': len(group),
                    'results': [],
                    'exported_from_csv': True,
                    'original_csv_file': os.path.basename(csv_file_path)
                }
                
                for _, row in group.iterrows():
                    result = {}
                    
                    # Map CSV columns back to result structure
                    column_mapping = {
                        'title': 'title',
                        'url': 'url',
                        'snippet': 'snippet',
                        'displayed_url': 'displayed_url',
                        'description': 'description',
                        'keywords': 'keywords',
                        'author': 'author',
                        'publish_date': 'publish_date',
                        'language': 'language',
                        'word_count': 'word_count',
                        'images_count': 'images_count',
                        'links_count': 'links_count',
                        'external_links_count': 'external_links_count',
                        'status_code': 'status_code',
                        'content_type': 'content_type',
                        'load_time': 'load_time',
                        'page_size': 'page_size',
                        'error': 'error'
                    }
                    
                    for csv_col, json_key in column_mapping.items():
                        if csv_col in row and pd.notna(row[csv_col]) and row[csv_col] != '':
                            value = row[csv_col]
                            # Convert numeric strings back to numbers
                            if csv_col in ['word_count', 'images_count', 'links_count', 'external_links_count', 'status_code', 'page_size']:
                                try:
                                    value = int(float(value))
                                except (ValueError, TypeError):
                                    pass
                            elif csv_col in ['load_time']:
                                try:
                                    value = float(value)
                                except (ValueError, TypeError):
                                    pass
                            result[json_key] = value
                    
                    # Handle list fields
                    list_fields = ['social_media_links', 'email_addresses', 'phone_numbers']
                    for field in list_fields:
                        if field in row and pd.notna(row[field]) and row[field] != '':
                            result[field] = [item.strip() for item in str(row[field]).split(';') if item.strip()]
                        else:
                            result[field] = []
                    
                    query_data['results'].append(result)
                
                json_data[query] = query_data
            
            # If there's only one query, return just that data
            if len(json_data) == 1:
                json_data = list(json_data.values())[0]
            
            # Save JSON file
            json_file_path = os.path.join(self.output_dir, output_filename)
            with open(json_file_path, 'w', encoding='utf-8') as f:
                json.dump(json_data, f, indent=2, ensure_ascii=False)
            
            self.logger.info(f"JSON export saved to: {json_file_path}")
            return json_file_path
            
        except Exception as e:
            self.logger.error(f"Error exporting to JSON: {e}")
            return None
            
    def export_results(self, input_file: str, output_format: str, output_filename: Optional[str] = None) -> Optional[str]:
        """
        Universal export function that can convert between JSON and CSV formats.
        
        Args:
            input_file (str): Path to input file (JSON or CSV)
            output_format (str): Desired output format ('json' or 'csv')
            output_filename (str): Optional output filename
            
        Returns:
            str: Path to exported file
        """
        if not os.path.exists(input_file):
            self.logger.error(f"Input file not found: {input_file}")
            return None
        
        input_format = 'json' if input_file.endswith('.json') else 'csv'
        
        if input_format == output_format:
            self.logger.warning(f"Input and output formats are the same ({output_format})")
            return input_file
        
        try:
            if input_format == 'json' and output_format == 'csv':
                return self.export_to_csv(input_file)
            elif input_format == 'csv' and output_format == 'json':
                return self.export_to_json(input_file, output_filename)
            else:
                self.logger.error(f"Unsupported conversion: {input_format} to {output_format}")
                return None
                
        except Exception as e:
            self.logger.error(f"Error during export: {e}")
            return None
            
    def search_from_file(self, file_path: str, query_column: str = 'query', 
                        sheet_name: Optional[str] = None, export_format: str = 'json') -> List[str]:
        """
        Search for multiple queries from Excel/CSV file.
        
        Args:
            file_path (str): Path to the file containing queries
            query_column (str): Column name containing queries
            sheet_name (str): Sheet name for Excel files
            export_format (str): Export format ('json', 'csv', 'both')
            
        Returns:
            List[str]: List of result file paths
        """
        result_files = []
        
        try:
            # Read file
            if file_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            else:
                # sheet_name=None makes read_excel return a dict of *all* sheets,
                # so fall back to the first sheet when none is specified
                df = pd.read_excel(file_path, sheet_name=sheet_name if sheet_name is not None else 0)
            
            if query_column not in df.columns:
                self.logger.error(f"Column '{query_column}' not found in file")
                return result_files
            
            # Process each query
            for index, row in df.iterrows():
                query = str(row[query_column]).strip()
                if query and query.lower() != 'nan':
                    self.logger.info(f"Processing query {index + 1}/{len(df)}: {query}")
                    
                    # Perform search
                    results = self.search_and_extract(query)
                    
                    # Save results in specified format
                    filename = f"batch_search_{index + 1}_{self.search_engine}"
                    saved_files = self.save_results(results, filename, export_format)
                    
                    if saved_files:
                        if isinstance(saved_files, tuple):
                            result_files.extend(saved_files)
                        else:
                            result_files.append(saved_files)
                    
                    # Delay between queries
                    if index < len(df) - 1:
                        time.sleep(self.delay * 2)  # Longer delay between queries
                        
        except Exception as e:
            self.logger.error(f"Error processing file {file_path}: {e}")
            
        return result_files
        
    def generate_summary_report(self, json_files: List[str], export_format: str = 'json') -> str:
        """
        Generate a summary report from multiple search result files.
        
        Args:
            json_files (List[str]): List of JSON result file paths
            export_format (str): Export format ('json', 'csv', 'both')
            
        Returns:
            str: Path to summary report file
        """
        try:
            summary_data = {
                'report_generated': datetime.now().isoformat(),
                'total_searches': len(json_files),
                'search_engines_used': set(),
                'total_results': 0,
                'successful_searches': 0,
                'failed_searches': 0,
                'average_results_per_search': 0,
                'total_pages_analyzed': 0,
                'common_domains': {},
                'content_types': {},
                'languages': {},
                'search_details': []
            }
            
            all_domains = []
            all_content_types = []
            all_languages = []
            
            for json_file in json_files:
                try:
                    with open(json_file, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    
                    search_detail = {
                        'query': data.get('query', ''),
                        'search_engine': data.get('search_engine', ''),
                        'results_count': len(data.get('results', [])),
                        'timestamp': data.get('timestamp', ''),
                        'error': data.get('error', '')
                    }
                    
                    summary_data['search_engines_used'].add(data.get('search_engine', ''))
                    summary_data['total_results'] += len(data.get('results', []))
                    
                    if data.get('error'):
                        summary_data['failed_searches'] += 1
                    else:
                        summary_data['successful_searches'] += 1
                    
                    # Analyze results
                    for result in data.get('results', []):
                        if result.get('url'):
                            domain = urllib.parse.urlparse(result['url']).netloc
                            all_domains.append(domain)
                            summary_data['total_pages_analyzed'] += 1
                        
                        if result.get('content_type'):
                            all_content_types.append(result['content_type'])
                        
                        if result.get('language'):
                            all_languages.append(result['language'])
                    
                    summary_data['search_details'].append(search_detail)
                    
                except Exception as e:
                    self.logger.error(f"Error processing {json_file}: {e}")
                    continue
            
            # Calculate statistics
            if summary_data['successful_searches'] > 0:
                summary_data['average_results_per_search'] = (
                    summary_data['total_results'] / summary_data['successful_searches']
                )
            
            # Count common domains, content types, and languages
            summary_data['common_domains'] = dict(Counter(all_domains).most_common(10))
            summary_data['content_types'] = dict(Counter(all_content_types))
            summary_data['languages'] = dict(Counter(all_languages))
            summary_data['search_engines_used'] = list(summary_data['search_engines_used'])
            
            # Save summary report
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"search_summary_report_{timestamp}"
            
            saved_files = self.save_results(summary_data, filename, export_format)
            
            if saved_files:
                if isinstance(saved_files, tuple):
                    self.logger.info(f"Summary report saved in multiple formats: {saved_files}")
                    return saved_files[0]  # Return the first file
                else:
                    self.logger.info(f"Summary report saved to: {saved_files}")
                    return saved_files
            
            return None
            
        except Exception as e:
            self.logger.error(f"Error generating summary report: {e}")
            return None
            
    def close(self):
        """Clean up resources."""
        try:
            if hasattr(self, 'driver'):
                self.driver.quit()
                self.logger.info("WebDriver closed successfully")
        except Exception as e:
            self.logger.error(f"Error closing WebDriver: {e}")
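

# --- Example usage ---
# A minimal sketch of how the class is meant to be driven; the query string and
# engine choice are illustrative only. Whichever supported browser is detected
# on the machine gets used, and output lands in the configured output_dir.
if __name__ == "__main__":
    tool = WebSearchTool(search_engine='bing', headless=True, max_results=5)
    try:
        # Single query: search, enrich each hit with page metadata, save both formats
        results = tool.search_and_extract("python web scraping")
        saved = tool.save_results(results, output_format='both')
        print(f"Saved results to: {saved}")
    finally:
        # Always release the WebDriver, even if the search raised
        tool.close()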