#!/usr/bin/env python3
"""
Enhanced San Francisco Restaurant Inspection Data Scraper
Handles both HTML content and PDF downloads from MyHealthDepartment portal
"""

import json
import time
import logging
import os
import re
import requests
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import pdfplumber
import PyPDF2
from urllib.parse import urljoin, urlparse

# Configure logging at import time: INFO level and above, with each record
# formatted as "timestamp - level - message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the scraper class and main() below.
logger = logging.getLogger(__name__)

class SFInspectionScraper:
    def __init__(self, headless=True, pdf_dir=None):
        """Initialize the scraper, prepare the PDF directory and start Chrome.

        Args:
            headless: When True, Chrome is started with ``--headless=new``.
            pdf_dir: Directory in which downloaded PDFs are stored. When
                omitted (or empty) the project's ``sf_pdfs`` data directory
                is used, which matches the previous hard-coded behaviour.
        """
        self.base_url = "https://inspections.myhealthdepartment.com/san-francisco"
        self.driver = None
        self.headless = headless
        # os.fspath lets callers pass a pathlib.Path; Chrome prefs need a str.
        self.pdf_dir = (
            os.fspath(pdf_dir)
            if pdf_dir
            else "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_pdfs"
        )
        # Create the directory BEFORE launching Chrome: the browser's
        # download preference then points at an existing path, and a failure
        # to create it no longer leaves an orphaned browser process behind.
        self.create_directories()
        self.setup_driver()
        
    def create_directories(self):
        """Make sure the PDF storage directory exists (no error if it already does)."""
        target_dir = self.pdf_dir
        os.makedirs(target_dir, exist_ok=True)
        logger.info(f"PDF directory ready: {target_dir}")
        
    def setup_driver(self):
        """Create the Chrome WebDriver and store it in ``self.driver``.

        Chrome is configured with the automation-hiding switches below, a
        fixed desktop user agent, and preferences that make it save PDFs
        into ``self.pdf_dir`` instead of opening them in the viewer.

        The driver is first created with Selenium's default driver
        resolution. If that raises, a driver binary is installed through
        ``webdriver_manager`` and creation is retried once; an error from
        the retry propagates to the caller.
        """
        chrome_options = Options()

        if self.headless:
            chrome_options.add_argument("--headless=new")

        # Essential options, followed by the user agent (same order as before).
        for argument in (
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--start-maximized",
            "--disable-blink-features=AutomationControlled",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ):
            chrome_options.add_argument(argument)

        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # Configure Chrome to download PDFs instead of displaying them
        prefs = {
            "download.default_directory": self.pdf_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True,
            "profile.default_content_settings.popups": 0
        }
        chrome_options.add_experimental_option("prefs", prefs)

        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            logger.info("Chrome driver initialized successfully")
        except Exception as e:
            # Was a bare ``except:``, which also trapped KeyboardInterrupt /
            # SystemExit and hid the reason the first attempt failed.
            logger.warning(f"Default Chrome driver startup failed: {e}")
            logger.info("Downloading Chrome driver...")
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info("Using downloaded Chrome driver")
    
    def download_pdf(self, pdf_url, filename):
        """Download a PDF over HTTP and save it inside ``self.pdf_dir``.

        Args:
            pdf_url: URL to fetch.
            filename: File name to use inside the PDF directory.

        Returns:
            The path of the saved file, or None when the request fails, the
            server answers with a status other than 200, the body is not a
            PDF, or the file cannot be written. Errors are logged, never
            raised.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        try:
            response = requests.get(pdf_url, headers=headers, timeout=30)

            if response.status_code != 200:
                logger.warning(f"Failed to download PDF: {response.status_code}")
                return None

            content = response.content
            # Callers treat any link containing "pdf" as a PDF, so the server
            # may well answer with an HTML page. Only keep bodies carrying
            # the "%PDF-" header marker near the start of the file.
            if b"%PDF-" not in content[:1024]:
                logger.warning(f"Response from {pdf_url} is not a PDF; not saved")
                return None

            pdf_path = os.path.join(self.pdf_dir, filename)
            with open(pdf_path, 'wb') as f:
                f.write(content)
            logger.info(f"PDF downloaded: {pdf_path}")
            return pdf_path
        except Exception as e:
            # Deliberate best-effort boundary: one bad download must not
            # abort the whole scrape.
            logger.error(f"Error downloading PDF: {e}")
            return None
    
    def extract_pdf_text(self, pdf_path):
        """Read the text of a PDF, preferring pdfplumber and falling back to PyPDF2.

        Returns a dict with the keys ``pdf_path``, ``extraction_method``
        (``'pdfplumber'``, ``'PyPDF2'`` or None if both failed), ``text`` and
        ``metadata`` (holding ``pages`` on success). PyPDF2 is only tried
        when pdfplumber raises. Failures are logged, not raised.
        """
        result = {
            'pdf_path': pdf_path,
            'extraction_method': None,
            'text': '',
            'metadata': {}
        }

        # First choice: pdfplumber, which can also pull out tables.
        plumber_failed = False
        try:
            with pdfplumber.open(pdf_path) as document:
                chunks = []
                for number, page in enumerate(document.pages, 1):
                    body = page.extract_text()
                    if body:
                        chunks.append(f"--- Page {number} ---\n{body}")

                    # Tables are appended after the page text, one row per line.
                    for table_index, table in enumerate(page.extract_tables() or (), 1):
                        chunks.append(f"\n--- Table {table_index} on Page {number} ---")
                        chunks.extend(
                            " | ".join(str(cell) if cell else "" for cell in row)
                            for row in table
                        )

                result['text'] = "\n".join(chunks)
                result['extraction_method'] = 'pdfplumber'
                result['metadata']['pages'] = len(document.pages)

                logger.info(f"Extracted {len(result['text'])} characters using pdfplumber")
        except Exception as e:
            logger.warning(f"pdfplumber extraction failed: {e}")
            plumber_failed = True

        if not plumber_failed:
            return result

        # Second choice: PyPDF2, used only after pdfplumber raised.
        try:
            with open(pdf_path, 'rb') as stream:
                reader = PyPDF2.PdfReader(stream)
                chunks = [
                    f"--- Page {number} ---\n{page.extract_text()}"
                    for number, page in enumerate(reader.pages, 1)
                ]

                result['text'] = "\n".join(chunks)
                result['extraction_method'] = 'PyPDF2'
                result['metadata']['pages'] = len(reader.pages)

                logger.info(f"Extracted {len(result['text'])} characters using PyPDF2")
        except Exception as e2:
            logger.error(f"All PDF extraction methods failed: {e2}")

        return result
    
    def parse_inspection_from_text(self, text):
        """Pull known inspection fields out of free text with regexes.

        Each field pattern is searched case-insensitively; the first match's
        captured group (stripped) is stored under the field name. If the text
        contains "violation(s)", everything from there up to "corrective
        action(s)" or the end of the text is stored, truncated to 1000
        characters, under ``violation_details``.

        Returns a dict containing only the fields that were found.
        """
        # (field name, pattern) pairs for labels commonly seen in reports.
        field_patterns = (
            ('facility_name', r'(?:Facility|Restaurant|Establishment)[\s:]+([^\n]+)'),
            ('address', r'(?:Address|Location)[\s:]+([^\n]+)'),
            ('date', r'(?:Inspection Date|Date of Inspection)[\s:]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'),
            ('score', r'(?:Score|Total Score|Points)[\s:]+(\d+)'),
            ('grade', r'(?:Grade|Rating)[\s:]+([A-F])'),
            ('permit', r'(?:Permit|License)[\s#:]+([A-Z0-9-]+)'),
            ('type', r'(?:Inspection Type|Type)[\s:]+([^\n]+)'),
            ('violations', r'(?:Violation|Code|Section)[\s:]+([^\n]+)'),
        )

        found = {}
        for name, expression in field_patterns:
            hit = re.search(expression, text, re.IGNORECASE)
            if hit is None:
                continue
            found[name] = hit.group(1).strip()

        # Free-form violation block, capped at the first 1000 characters.
        block = re.search(
            r'violations?(.*?)(?:corrective actions?|$)',
            text,
            re.IGNORECASE | re.DOTALL,
        )
        if block is not None:
            found['violation_details'] = block.group(1)[:1000]

        return found
    
    def scrape_with_search(self, search_term="", limit=5):
        """Scrape inspection records from the portal's search results.

        Loads ``self.base_url``, submits ``search_term`` through the first
        visible search box (an empty term just presses Enter), then parses
        the rendered page. For each result, linked PDFs are downloaded and
        parsed, and linked detail pages are opened in a temporary tab and
        parsed.

        Args:
            search_term: Text typed into the search box; may be empty.
            limit: Maximum number of results to process.

        Returns:
            A list of dicts, one per processed result. Errors are logged
            rather than raised; records collected before a failure are
            still returned.
        """
        inspections = []

        try:
            logger.info(f"Navigating to {self.base_url}")
            self.driver.get(self.base_url)
            time.sleep(3)

            search_box = self._find_search_box()
            if search_box:
                # If no search term provided, just hit enter to get all results
                if search_term:
                    search_box.clear()
                    search_box.send_keys(search_term)
                    logger.info(f"Searched for: {search_term}")
                else:
                    logger.info("Searching for all facilities (empty search)")

                search_box.send_keys(Keys.RETURN)
                time.sleep(3)

                # Keep a screenshot of the results page for debugging.
                self.driver.save_screenshot("/tmp/sf_search_results.png")
                logger.info("Search results screenshot saved")

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            results = self._collect_result_elements(soup, limit)

            for i, result in enumerate(results[:limit]):
                try:
                    inspections.append(self._process_result(i, result))
                    logger.info(f"Processed inspection {i+1}/{limit}")
                except Exception as e:
                    # A failing result is skipped; the rest are still tried.
                    logger.warning(f"Error processing result {i+1}: {e}")
                    continue

        except Exception as e:
            logger.error(f"Error during scraping: {e}")

            # Save debug info. Guarded so that a dead driver or unwritable
            # file cannot raise out of this handler and lose the records
            # collected so far.
            if self.driver:
                try:
                    with open("/tmp/sf_page_source.html", "w", encoding="utf-8") as f:
                        f.write(self.driver.page_source)
                    logger.info("Page source saved to /tmp/sf_page_source.html")
                except Exception as dump_error:
                    logger.warning(f"Could not save debug page source: {dump_error}")

        return inspections

    def _find_search_box(self):
        """Return the first *visible* search input on the current page, or None."""
        search_selectors = [
            "input[type='search']",
            "input[type='text']",
            "input[placeholder*='search' i]",
            "input[placeholder*='restaurant' i]",
            "input[placeholder*='facility' i]",
            "input[name*='search' i]",
            "#search",
            ".search-input"
        ]

        for selector in search_selectors:
            try:
                candidate = self.driver.find_element(By.CSS_SELECTOR, selector)
                # Only a displayed element is returned; previously a hidden
                # match could be left selected and later fail on send_keys.
                if candidate.is_displayed():
                    logger.info(f"Found search box with selector: {selector}")
                    return candidate
            except Exception:
                # Selector not present (or not usable) on this page: try the next.
                continue
        return None

    def _collect_result_elements(self, soup, limit):
        """Return result elements from the parsed page, falling back to plausible links."""
        result_selectors = [
            ".result-item",
            ".inspection-result",
            ".facility-row",
            ".restaurant-item",
            "tr[data-facility]",
            "div[data-restaurant]",
            ".list-group-item",
            "article",
            ".card"
        ]

        # The first selector that matches anything wins.
        for selector in result_selectors:
            results = soup.select(selector)
            if results:
                logger.info(f"Found {len(results)} results with selector: {selector}")
                return results

        # No result containers: keep links whose href looks like a
        # facility/inspection page and whose text is not site navigation.
        facility_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text(strip=True)

            if any(keyword in href.lower() for keyword in ['facility', 'inspection', 'report', 'detail', 'view']):
                if text and len(text) > 5 and not any(skip in text.lower() for skip in ['home', 'about', 'contact', 'search']):
                    facility_links.append(link)

        results = facility_links[:limit * 2]  # Get more in case some fail
        logger.info(f"Found {len(results)} potential facility links")
        return results

    def _process_result(self, index, result):
        """Build the record for one result element, following its PDF and detail links."""
        inspection_data = {
            'index': index + 1,
            'scraped_at': datetime.now().isoformat()
        }

        text = result.get_text(separator=' ', strip=True)
        inspection_data['result_text'] = text[:500]
        # First line of the result text, capped at 100 characters.
        inspection_data['facility_name'] = text.split('\n')[0][:100]

        links = list(result.find_all('a', href=True))
        # find_all only searches descendants, so a result that is itself an
        # <a> (the link fallback) has to be added explicitly; otherwise its
        # own href would never be examined.
        if result.name == 'a' and result.get('href'):
            links.insert(0, result)

        for link in links:
            href = link.get('href', '')
            href_lower = href.lower()

            # Any href mentioning "pdf" is treated as a PDF document.
            if 'pdf' in href_lower:
                pdf_url = urljoin(self.base_url, href)
                inspection_data['pdf_url'] = pdf_url

                pdf_filename = f"inspection_{index+1}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pdf"
                pdf_path = self.download_pdf(pdf_url, pdf_filename)

                if pdf_path:
                    pdf_data = self.extract_pdf_text(pdf_path)
                    inspection_data['pdf_text'] = pdf_data['text'][:2000]  # First 2000 chars
                    inspection_data['pdf_metadata'] = pdf_data['metadata']

                    # Fields parsed from the PDF are merged into the record.
                    inspection_data.update(self.parse_inspection_from_text(pdf_data['text']))

            elif any(keyword in href_lower for keyword in ['detail', 'view', 'facility', 'report']):
                detail_url = urljoin(self.base_url, href)
                inspection_data['detail_url'] = detail_url
                self._visit_detail_page(detail_url, inspection_data)

        return inspection_data

    def _visit_detail_page(self, detail_url, inspection_data):
        """Open a detail page in a temporary tab and merge what it yields into the record."""
        main_handle = self.driver.current_window_handle
        try:
            # Pass the URL as a script argument instead of interpolating it
            # into JavaScript source, so quotes in the URL cannot break (or
            # inject into) the script.
            self.driver.execute_script("window.open(arguments[0], '_blank');", detail_url)
            self.driver.switch_to.window(self.driver.window_handles[-1])
            time.sleep(2)

            detail_soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            detail_text = detail_soup.get_text(separator=' ', strip=True)
            inspection_data['detail_text'] = detail_text[:2000]

            inspection_data.update(self.parse_inspection_from_text(detail_text))

            # Record up to three PDF links found on the detail page.
            pdf_links = detail_soup.find_all('a', href=lambda x: x and x.endswith('.pdf'))
            if pdf_links:
                inspection_data['detail_pdf_links'] = [urljoin(detail_url, link['href']) for link in pdf_links[:3]]

        except Exception as e:
            logger.warning(f"Error visiting detail page: {e}")
        finally:
            self._close_extra_tabs(main_handle)

    def _close_extra_tabs(self, main_handle):
        """Close every window except ``main_handle`` and switch back to it."""
        # Closing by handle guarantees the search-results window itself is
        # never the one that gets closed, whichever window is current.
        for handle in self.driver.window_handles:
            if handle != main_handle:
                self.driver.switch_to.window(handle)
                self.driver.close()
        self.driver.switch_to.window(main_handle)
    
    def close(self):
        """Quit the browser if a driver exists; otherwise do nothing."""
        if not self.driver:
            return
        self.driver.quit()
        logger.info("Browser closed")

def main():
    """Run one scraping session and write the JSON and CSV outputs.

    Scrapes up to five results with an empty search, retrying with the term
    "restaurant" if nothing came back. The records are written to a JSON
    file, a summary is printed, and, when there are records, a flattened CSV
    is written too. Any error is logged with a traceback; the browser is
    always closed at the end.
    """
    logger.info("Starting enhanced San Francisco inspection scraper...")

    scraper = SFInspectionScraper(headless=True)

    try:
        # An empty search is tried first to get recent inspections.
        logger.info("Attempting to scrape recent inspections...")
        records = scraper.scrape_with_search(search_term="", limit=5)

        # Fall back to a common search term when nothing was returned.
        if not records:
            logger.info("No results from empty search, trying 'restaurant'...")
            records = scraper.scrape_with_search(search_term="restaurant", limit=5)

        # Persist the raw records.
        json_path = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_enhanced_results.json"
        with open(json_path, "w") as handle:
            json.dump(records, handle, indent=2, default=str)
        logger.info(f"Results saved to {json_path}")

        # Console summary.
        rule = "=" * 60
        pdf_count = sum(1 for record in records if 'pdf_url' in record)
        detail_count = sum(1 for record in records if 'detail_url' in record)

        print("\n" + rule)
        print("ENHANCED SCRAPING SUMMARY")
        print(rule)
        print(f"Total records scraped: {len(records)}")
        print(f"PDFs downloaded: {pdf_count}")
        print(f"Detail pages visited: {detail_count}")

        print("\nExtracted Information:")
        for number, record in enumerate(records, 1):
            print(f"\n--- Inspection {number} ---")

            # Key fields, printed only when present.
            for field in ('facility_name', 'address', 'date', 'score', 'grade', 'permit', 'type'):
                if field in record:
                    print(f"  {field}: {record[field]}")

            # Where the data came from.
            if 'pdf_url' in record:
                print(f"  PDF: {record['pdf_url']}")
            if 'detail_url' in record:
                print(f"  Detail Page: {record['detail_url']}")

            # Short sample of extracted text; PDF text takes precedence.
            if 'pdf_text' in record:
                print(f"  PDF Text Sample: {record['pdf_text'][:200]}...")
            elif 'detail_text' in record:
                print(f"  Detail Text Sample: {record['detail_text'][:200]}...")

        # Flattened CSV, written only when there is at least one record.
        if records:
            # (CSV column, record key) pairs, in output column order.
            text_columns = (
                ('facility_name', 'facility_name'),
                ('address', 'address'),
                ('inspection_date', 'date'),
                ('score', 'score'),
                ('grade', 'grade'),
                ('permit', 'permit'),
                ('inspection_type', 'type'),
            )

            rows = []
            for record in records:
                row = {column: record.get(key, '') for column, key in text_columns}
                row['has_pdf'] = 'pdf_url' in record
                row['has_details'] = 'detail_url' in record
                row['scraped_at'] = record.get('scraped_at', '')
                rows.append(row)

            frame = pd.DataFrame(rows)
            csv_path = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_enhanced_results.csv"
            frame.to_csv(csv_path, index=False)
            logger.info(f"CSV saved to {csv_path}")

            print("\nCSV Summary:")
            print(frame.to_string())

    except Exception as e:
        logger.error(f"Error in main: {e}")
        import traceback
        traceback.print_exc()

    finally:
        scraper.close()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()