#!/usr/bin/env python3
"""
Debug version of SF scraper to understand the page structure
"""

import json
import time
import logging
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import os

# Logging setup: the root logger emits everything from DEBUG upwards, each
# record prefixed with its timestamp and severity.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.DEBUG)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)

def _write_text(path, text):
    """Write *text* to *path* as UTF-8, independent of the locale's default codec."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


def _collect_debug_info(soup, url, title):
    """Return a JSON-serialisable summary of the structure found in *soup*.

    Args:
        soup: Parsed BeautifulSoup document of the page.
        url: URL the browser ended up on (stored under ``'url'``).
        title: Page title reported by the browser (stored under ``'title'``).

    Returns:
        dict with form, link, keyword, iframe, script and layout statistics.
    """
    debug_info = {
        'url': url,
        'title': title,
        'timestamp': datetime.now().isoformat()
    }

    # Forms and the fields each of them contains
    forms = soup.find_all('form')
    debug_info['forms_count'] = len(forms)
    debug_info['forms'] = [
        {
            'index': i,
            'action': form.get('action', ''),
            'method': form.get('method', ''),
            'inputs': [
                {
                    'type': inp.get('type', ''),
                    'name': inp.get('name', ''),
                    'placeholder': inp.get('placeholder', ''),
                    'value': inp.get('value', '')
                }
                for inp in form.find_all(['input', 'select', 'textarea'])
            ]
        }
        for i, form in enumerate(forms)
    ]

    # Links: total count plus the first 20 as a sample
    links = soup.find_all('a', href=True)
    debug_info['links_count'] = len(links)
    debug_info['sample_links'] = [
        {
            'text': link.get_text(strip=True)[:50],
            'href': link['href']
        }
        for link in links[:20]
    ]

    # Text nodes mentioning facility-related keywords (first 10 hits only).
    # `string=True` is the current spelling of the legacy `text=True` argument.
    keywords = ['restaurant', 'facility', 'inspection', 'score', 'grade', 'permit']
    debug_info['keyword_matches'] = []
    for text in soup.find_all(string=True):
        text_str = str(text).strip()
        if not text_str:
            continue
        lowered = text_str.lower()
        if any(kw in lowered for kw in keywords):
            parent_tag = text.parent.name if text.parent else 'unknown'
            debug_info['keyword_matches'].append({
                'text': text_str[:100],
                'parent_tag': parent_tag
            })
            if len(debug_info['keyword_matches']) >= 10:
                break

    # Iframes (the 'iframes' key is only present when at least one exists)
    iframes = soup.find_all('iframe')
    debug_info['iframes_count'] = len(iframes)
    if iframes:
        debug_info['iframes'] = [
            {
                'src': iframe.get('src', ''),
                'id': iframe.get('id', ''),
                'class': iframe.get('class', [])
            }
            for iframe in iframes
        ]

    # JavaScript framework hints: substring search over the script tags.
    # Each tag is stringified once and reused for all three checks.
    scripts = soup.find_all('script')
    script_texts = [str(script).lower() for script in scripts]
    debug_info['scripts_count'] = len(scripts)
    debug_info['has_react'] = any('react' in s for s in script_texts)
    debug_info['has_angular'] = any('angular' in s for s in script_texts)
    debug_info['has_vue'] = any('vue' in s for s in script_texts)

    # Elements carrying a data-api attribute that might expose an endpoint
    elements_with_data = soup.find_all(attrs={"data-api": True})
    if elements_with_data:
        debug_info['data_api_elements'] = [
            {
                'tag': elem.name,
                'data-api': elem.get('data-api', '')
            } for elem in elements_with_data[:5]
        ]

    # Page layout. find() with a list matches tag *names* only, so the
    # attribute-based checks go through select_one(), which takes CSS selectors.
    debug_info['page_structure'] = {
        'has_header': bool(soup.find(['header', 'nav'])),
        'has_main': bool(soup.select_one('main, div[role="main"]')),
        'has_search': bool(soup.select_one('input[type="search"], input[type="text"], form')),
        'divs_count': len(soup.find_all('div')),
        'tables_count': len(soup.find_all('table')),
        'lists_count': len(soup.find_all(['ul', 'ol']))
    }

    return debug_info


def _print_summary(debug_info):
    """Print a human-readable digest of *debug_info* to stdout."""
    print("\n" + "="*60)
    print("PAGE ANALYSIS SUMMARY")
    print("="*60)
    print(f"URL: {debug_info['url']}")
    print(f"Title: {debug_info['title']}")
    print(f"Forms found: {debug_info['forms_count']}")
    print(f"Links found: {debug_info['links_count']}")
    print(f"Iframes found: {debug_info['iframes_count']}")
    print(f"Scripts found: {debug_info['scripts_count']}")
    print(f"Has React: {debug_info['has_react']}")
    print(f"Has search input: {debug_info['page_structure']['has_search']}")

    if debug_info['forms']:
        print("\nForms detected:")
        for form in debug_info['forms']:
            print(f"  - Form {form['index']}: {len(form['inputs'])} inputs")
            for inp in form['inputs'][:3]:  # Show first 3 inputs
                print(f"    - {inp['type']} ({inp['name']}): {inp['placeholder']}")

    if debug_info['keyword_matches']:
        print(f"\nFound {len(debug_info['keyword_matches'])} keyword matches")
        for match in debug_info['keyword_matches'][:3]:
            print(f"  - {match['text'][:50]}...")


def _try_search(driver, output_dir):
    """Submit the query "Pizza" through the first visible search box and save the result.

    Returns:
        True if a search was submitted and its page saved, False if no
        strategy located a usable input.
    """
    # Imported locally so this block does not depend on the file-level imports.
    from selenium.common.exceptions import WebDriverException

    results_html_path = os.path.join(output_dir, "sf_search_results.html")
    results_png_path = os.path.join(output_dir, "sf_search_results.png")

    # Multiple strategies to find search, tried in order
    search_strategies = [
        (By.CSS_SELECTOR, "input[type='search']"),
        (By.CSS_SELECTOR, "input[type='text']"),
        (By.CSS_SELECTOR, "input[placeholder*='search' i]"),
        (By.CSS_SELECTOR, "input[placeholder*='name' i]"),
        (By.CSS_SELECTOR, "input[placeholder*='restaurant' i]"),
        (By.CSS_SELECTOR, "#search"),
        (By.CSS_SELECTOR, ".search-input"),
        (By.XPATH, "//input[contains(@placeholder, 'Search')]"),
        (By.XPATH, "//input[contains(@placeholder, 'search')]"),
    ]

    for strategy, selector in search_strategies:
        try:
            search_input = driver.find_element(strategy, selector)
            if not search_input.is_displayed():
                continue
            logger.info(f"Found search input with strategy: {strategy} - {selector}")
            search_input.clear()
            search_input.send_keys("Pizza")
            search_input.send_keys(Keys.RETURN)
        except WebDriverException as exc:
            # Only browser-side failures (element missing, not interactable,
            # stale, ...) mean "try the next selector"; anything else propagates.
            logger.debug(f"Search strategy {strategy} - {selector} failed: {type(exc).__name__}")
            continue

        # Give the results time to render before capturing them
        time.sleep(3)

        _write_text(results_html_path, driver.page_source)
        logger.info(f"Search results saved to {results_html_path}")

        # save_screenshot reports I/O failure through its return value
        if driver.save_screenshot(results_png_path):
            logger.info("Search results screenshot saved")
        else:
            logger.warning(f"Could not save search results screenshot to {results_png_path}")
        return True

    logger.info("No visible search input found")
    return False


def debug_scrape(url="https://inspections.myhealthdepartment.com/san-francisco",
                 output_dir="/tmp"):
    """Load a page in headless Chrome and dump diagnostics about its structure.

    Saves a screenshot, the raw page source and a JSON summary (forms, links,
    keyword hits, iframes, script hints, layout counts) into *output_dir*,
    prints a short summary, then tries to run a sample search and saves the
    resulting page as well.

    Args:
        url: Page to inspect. Defaults to the San Francisco inspections portal.
        output_dir: Directory for the generated files; created if missing.

    Returns:
        None. Errors raised after the browser has started are logged with a
        traceback rather than propagated; the browser is always closed.
    """
    os.makedirs(output_dir, exist_ok=True)
    screenshot_path = os.path.join(output_dir, "sf_debug_screenshot.png")
    source_path = os.path.join(output_dir, "sf_debug_source.html")
    info_path = os.path.join(output_dir, "sf_debug_info.json")

    # Setup Chrome; flags reduce the obvious automation fingerprints
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Run headless for server
    chrome_options.add_argument("--headless=new")

    driver = webdriver.Chrome(options=chrome_options)

    try:
        logger.info(f"Navigating to {url}")
        driver.get(url)

        # Wait for page to load
        time.sleep(5)

        # Take screenshot (save_screenshot returns False on an I/O error)
        if driver.save_screenshot(screenshot_path):
            logger.info(f"Screenshot saved to {screenshot_path}")
        else:
            logger.warning(f"Could not save screenshot to {screenshot_path}")

        # Get page source
        page_source = driver.page_source
        _write_text(source_path, page_source)
        logger.info(f"Page source saved to {source_path}")

        # Parse with BeautifulSoup and gather the structural summary
        soup = BeautifulSoup(page_source, 'html.parser')
        debug_info = _collect_debug_info(soup, driver.current_url, driver.title)

        # Save debug info
        with open(info_path, "w", encoding="utf-8") as f:
            json.dump(debug_info, f, indent=2, default=str)
        logger.info(f"Debug info saved to {info_path}")

        _print_summary(debug_info)

        # Try interacting with the page
        logger.info("Attempting to interact with page elements...")

        # Look for any clickable elements
        clickable = driver.find_elements(By.CSS_SELECTOR, "button, a, input[type='submit'], div[onclick]")
        logger.info(f"Found {len(clickable)} clickable elements")

        # The search is best-effort: a failure here must not discard the
        # diagnostics already written above.
        try:
            _try_search(driver, output_dir)
        except Exception as e:
            logger.warning(f"Could not perform search: {e}")

    except Exception as e:
        # logger.exception records the traceback together with the message
        logger.exception(f"Error during debug scraping: {e}")

    finally:
        driver.quit()
        logger.info("Browser closed")

        # Show what files were created
        print("\nDebug files created:")
        for filename in [screenshot_path, source_path, info_path,
                         os.path.join(output_dir, "sf_search_results.html"),
                         os.path.join(output_dir, "sf_search_results.png")]:
            if os.path.exists(filename):
                size = os.path.getsize(filename)
                print(f"  - {filename} ({size:,} bytes)")

# Run the diagnostic only when this file is executed as a script, not on import.
if __name__ == "__main__":
    debug_scrape()