#!/usr/bin/env python3
"""
Targeted San Francisco Restaurant Inspection Scraper
Specifically designed for the MyHealthDepartment portal structure
"""

import json
import time
import logging
import os
import re
import requests
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import pdfplumber
from urllib.parse import urlparse, parse_qs

# Configure logging once at import time: records at INFO and above are
# emitted, each prefixed with a timestamp and its level name. DEBUG messages
# (e.g. the link samples logged when no inspection IDs are found) are
# suppressed at this level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by the scraper class and main().
logger = logging.getLogger(__name__)

class SFInspectionScraper:
    def __init__(self, headless=True, pdf_dir=None):
        """Initialize the scraper, its output directory and the Chrome driver.

        Args:
            headless: When true, Chrome is started with ``--headless=new``.
            pdf_dir: Directory that receives downloaded PDFs (and debug
                responses). Defaults to the production data directory when
                omitted or empty.

        Raises:
            OSError: If the PDF directory cannot be created.
            Exception: Whatever ``setup_driver`` re-raises when Chrome cannot
                be started.
        """
        self.base_url = "https://inspections.myhealthdepartment.com/san-francisco"
        self.pdf_base_url = "https://inspections.myhealthdepartment.com/san-francisco/print/"
        self.driver = None
        self.headless = headless
        self.pdf_dir = pdf_dir or "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_pdfs"
        # Create the output directory BEFORE launching Chrome: if the
        # filesystem step fails, the constructor raises and the caller never
        # receives an object to close(), so a browser started earlier would
        # be leaked.
        self.create_directories()
        self.setup_driver()
        
    def create_directories(self):
        """Create necessary directories"""
        os.makedirs(self.pdf_dir, exist_ok=True)
        logger.info(f"PDF directory ready: {self.pdf_dir}")
        
    def setup_driver(self):
        """Start a Chrome WebDriver session and store it on ``self.driver``.

        The browser runs with ``--headless=new`` when ``self.headless`` is
        true, a fixed 1920x1080 window and a Linux Chrome user-agent string.
        The automation-related switches are presumably there to make the
        session look less like an automated browser.

        Raises:
            Exception: Any error raised while creating the driver is logged
                and re-raised unchanged.
        """
        opts = Options()

        # Command-line switches, in the order they are handed to Chrome.
        switches = ["--headless=new"] if self.headless else []
        switches += [
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--disable-blink-features=AutomationControlled",
            "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        ]
        for switch in switches:
            opts.add_argument(switch)

        experimental = (
            ("excludeSwitches", ["enable-automation"]),
            ('useAutomationExtension', False),
        )
        for option_name, option_value in experimental:
            opts.add_experimental_option(option_name, option_value)

        try:
            self.driver = webdriver.Chrome(options=opts)
            logger.info("Chrome driver initialized")
        except Exception as e:
            logger.error(f"Failed to initialize Chrome driver: {e}")
            raise
    
    def extract_inspection_ids(self, html_content):
        """Extract the distinct inspection IDs linked from a page.

        Every ``<a>`` whose ``href`` contains ``inspectionID=`` is examined.
        The ID is matched case-insensitively (hex digits and dashes), and each
        inspection is reported only once even when several links point to it,
        so a caller's ``limit`` is not consumed by duplicates.

        Args:
            html_content: HTML markup of the page to scan.

        Returns:
            A list of dicts in document order of first appearance. Each dict
            has the keys ``inspection_id``, ``inspection_url``, ``link_text``
            and ``href``, plus ``context_text`` (first 500 characters of the
            parent element's text) when the parent holds more text than the
            link itself.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # IGNORECASE so lowercase hex IDs are captured in full instead of
        # being skipped or truncated at the first lowercase character.
        id_pattern = re.compile(r'inspectionID=([A-F0-9\-]+)', re.IGNORECASE)
        records_by_id = {}  # insertion-ordered: keeps first-seen order

        for link in soup.find_all('a', href=lambda x: x and 'inspectionID=' in x):
            href = link['href']
            match = id_pattern.search(href)
            if not match:
                continue

            inspection_id = match.group(1)
            link_text = link.get_text(strip=True)
            dedupe_key = inspection_id.upper()

            existing = records_by_id.get(dedupe_key)
            if existing is not None:
                # Same inspection linked again (e.g. an icon link followed by
                # a text link): keep the first record, but adopt a non-empty
                # label if the first link had none.
                if not existing['link_text'] and link_text:
                    existing['link_text'] = link_text
                continue

            facility_info = {
                'inspection_id': inspection_id,
                'inspection_url': f"{self.base_url}/inspection/?inspectionID={inspection_id}",
                'link_text': link_text,
                'href': href
            }

            # The parent element's text may carry extra context (such as the
            # facility name) beyond the link label itself.
            parent = link.parent
            if parent:
                parent_text = parent.get_text(strip=True)
                if parent_text and len(parent_text) > len(link_text):
                    facility_info['context_text'] = parent_text[:500]

            records_by_id[dedupe_key] = facility_info
            logger.info(f"Found inspection ID: {inspection_id}")

        return list(records_by_id.values())
    
    def download_inspection_pdf(self, inspection_id):
        """Fetch the printable PDF of one inspection and store it on disk.

        Args:
            inspection_id: Inspection identifier; it is used twice in the
                ``pKey`` query value and in the output file name.

        Returns:
            Path of the saved ``inspection_<id>.pdf`` inside ``self.pdf_dir``,
            or ``None`` when the request fails, the status is not 200, or the
            response is not a PDF. A non-PDF body is written to
            ``debug_<id>.html`` in the same directory for inspection.
        """
        # URL pattern:
        # https://inspections.myhealthdepartment.com/san-francisco/print/?task=getPrintable&path=san-francisco&pKey=ID,ID
        pdf_url = f"{self.pdf_base_url}?task=getPrintable&path=san-francisco&pKey={inspection_id},{inspection_id}"

        try:
            request_headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
                'Accept': 'application/pdf,*/*',
                'Referer': self.base_url
            }

            logger.info(f"Downloading PDF from: {pdf_url}")
            response = requests.get(pdf_url, headers=request_headers, timeout=30)

            if response.status_code != 200:
                logger.warning(f"Failed to download PDF: HTTP {response.status_code}")
                return None

            # Accept the body when either the declared type or the leading
            # magic bytes say it is a PDF.
            content_type = response.headers.get('content-type', '')
            is_pdf = 'pdf' in content_type.lower() or response.content.startswith(b'%PDF')

            if not is_pdf:
                logger.warning(f"Response is not a PDF. Content-Type: {content_type}")
                # Keep the unexpected payload around for debugging
                debug_path = os.path.join(self.pdf_dir, f"debug_{inspection_id}.html")
                with open(debug_path, 'wb') as debug_file:
                    debug_file.write(response.content)
                logger.info(f"Debug response saved to: {debug_path}")
                return None

            pdf_path = os.path.join(self.pdf_dir, f"inspection_{inspection_id}.pdf")
            with open(pdf_path, 'wb') as pdf_file:
                pdf_file.write(response.content)

            logger.info(f"PDF saved: {pdf_path} ({len(response.content):,} bytes)")
            return pdf_path

        except Exception as e:
            logger.error(f"Error downloading PDF for {inspection_id}: {e}")
            return None
    
    def extract_pdf_data(self, pdf_path):
        """Extract inspection data from PDF"""
        inspection_data = {
            'pdf_path': pdf_path,
            'extracted_at': datetime.now().isoformat()
        }
        
        try:
            with pdfplumber.open(pdf_path) as pdf:
                all_text = []
                all_tables = []
                
                for page_num, page in enumerate(pdf.pages, 1):
                    # Extract text
                    text = page.extract_text()
                    if text:
                        all_text.append(text)
                    
                    # Extract tables
                    tables = page.extract_tables()
                    if tables:
                        for table in tables:
                            all_tables.append(table)
                
                # Combine all text
                full_text = "\n".join(all_text)
                inspection_data['full_text'] = full_text
                
                # Parse key information using regex patterns
                patterns = {
                    'facility_name': r'(?:Facility Name|Restaurant|Establishment)[:\s]+([^\n]+)',
                    'address': r'(?:Address|Location)[:\s]+([^\n]+)',
                    'city': r'(?:City)[:\s]+([^\n]+)',
                    'zip': r'(?:Zip|Postal)[:\s]+(\d{5}(?:-\d{4})?)',
                    'inspection_date': r'(?:Inspection Date|Date of Inspection)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
                    'inspection_type': r'(?:Inspection Type|Type)[:\s]+([^\n]+)',
                    'score': r'(?:Score|Total Score|Points)[:\s]+(\d+)',
                    'grade': r'(?:Grade|Rating)[:\s]+([A-F])',
                    'permit_number': r'(?:Permit|License|Permit Number)[:\s]+([^\n]+)',
                    'inspector': r'(?:Inspector|Inspected By)[:\s]+([^\n]+)',
                }
                
                for field, pattern in patterns.items():
                    match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE)
                    if match:
                        inspection_data[field] = match.group(1).strip()
                
                # Look for violations
                violations = []
                
                # Method 1: Look for violation sections
                violation_patterns = [
                    r'(?:Violation|Infraction|Non-compliance)[\s#:]+(\d+)[^\n]*\n([^\n]+)',
                    r'(\d+)\.\s+([^\n]+(?:\n(?!\d+\.).*)*)',  # Numbered list
                    r'•\s+([^\n]+)',  # Bullet points
                ]
                
                for pattern in violation_patterns:
                    matches = re.findall(pattern, full_text, re.MULTILINE)
                    if matches:
                        for match in matches:
                            if isinstance(match, tuple):
                                violation = {
                                    'code': match[0] if len(match) > 1 else '',
                                    'description': match[1] if len(match) > 1 else match[0]
                                }
                            else:
                                violation = {'description': match}
                            violations.append(violation)
                
                if violations:
                    inspection_data['violations'] = violations[:20]  # Limit to first 20
                    inspection_data['violation_count'] = len(violations)
                
                # Process tables if found
                if all_tables:
                    inspection_data['tables_found'] = len(all_tables)
                    
                    # Look for violation table
                    for table in all_tables:
                        if table and len(table) > 1:
                            # Check if this looks like a violation table
                            header_row = table[0] if table else []
                            if any(cell and 'violation' in str(cell).lower() for cell in header_row):
                                inspection_data['violation_table'] = table[:10]  # First 10 rows
                                break
                
                # Extract text snippets for context
                inspection_data['text_preview'] = full_text[:1000]
                
                logger.info(f"Extracted {len(full_text)} characters from PDF")
                
        except Exception as e:
            logger.error(f"Error extracting PDF data: {e}")
            inspection_data['extraction_error'] = str(e)
        
        return inspection_data
    
    def scrape_inspection_details(self, inspection_id):
        """Load one inspection page in the browser and capture its text.

        Args:
            inspection_id: Inspection identifier used to build the page URL.

        Returns:
            A dict with ``inspection_id``, ``url``, ``page_title``,
            ``scraped_at`` and ``page_text`` (at most 2000 characters, taken
            from up to three matching content sections or, failing that,
            from the whole page). On any error the dict holds only
            ``inspection_id`` and ``error``.
        """
        inspection_url = f"{self.base_url}/inspection/?inspectionID={inspection_id}"
        section_keywords = ['detail', 'info', 'content', 'inspection']

        def looks_like_info_class(css_class):
            # True when a class value contains one of the section keywords.
            if not css_class:
                return False
            lowered = str(css_class).lower()
            return any(keyword in lowered for keyword in section_keywords)

        try:
            logger.info(f"Fetching inspection details from: {inspection_url}")
            self.driver.get(inspection_url)
            time.sleep(2)  # fixed pause after navigation before reading the page

            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

            details = {
                'inspection_id': inspection_id,
                'url': inspection_url,
                'page_title': self.driver.title,
                'scraped_at': datetime.now().isoformat()
            }

            # Prefer container elements whose class hints at inspection content.
            info_sections = soup.find_all(['div', 'section', 'article'], class_=looks_like_info_class)

            if info_sections:
                section_texts = [section.get_text(strip=True) for section in info_sections[:3]]
                details['page_text'] = ' '.join(section_texts)[:2000]
            else:
                details['page_text'] = soup.get_text(strip=True)[:2000]

            return details

        except Exception as e:
            logger.error(f"Error scraping inspection {inspection_id}: {e}")
            return {'inspection_id': inspection_id, 'error': str(e)}
    
    def scrape_recent_inspections(self, limit=5):
        """Scrape the landing page and process up to ``limit`` inspections.

        For each inspection found on the main page the PDF is downloaded and
        parsed, and the inspection's HTML page is scraped as well.

        Args:
            limit: Maximum number of inspections to process; ``None`` means
                all inspections found.

        Returns:
            A list of record dicts with ``index``, ``inspection_id``,
            ``facility_name``, ``inspection_url``, ``scraped_at``,
            ``pdf_downloaded`` and ``html_details``. When the PDF was
            downloaded the record is also updated with everything
            ``extract_pdf_data`` returns, which may overwrite
            ``facility_name``. Errors are logged and whatever was collected
            so far is returned.
        """
        all_inspections = []

        try:
            logger.info(f"Navigating to {self.base_url}")
            self.driver.get(self.base_url)
            time.sleep(3)

            page_source = self.driver.page_source

            # The debug dump is best-effort: write it as UTF-8 (page markup
            # is not guaranteed to fit the locale's default encoding) and
            # never let a failure here abort the actual scrape.
            try:
                with open("/tmp/sf_main_page.html", "w", encoding="utf-8") as f:
                    f.write(page_source)
                logger.info("Main page saved to /tmp/sf_main_page.html")
            except OSError as e:
                logger.warning(f"Could not save main page for debugging: {e}")

            inspection_infos = self.extract_inspection_ids(page_source)

            if not inspection_infos:
                logger.warning("No inspection IDs found on main page")

                # Log a sample of the page's links to help diagnose why
                # nothing matched.
                soup = BeautifulSoup(page_source, 'html.parser')
                all_links = soup.find_all('a', href=True)
                logger.info(f"Found {len(all_links)} total links on page")

                for link in all_links[:10]:
                    logger.debug(f"Link: {link.get('href', '')} - Text: {link.get_text(strip=True)[:50]}")

                return all_inspections

            logger.info(f"Found {len(inspection_infos)} inspection IDs")

            selected = inspection_infos[:limit]
            total = len(selected)  # actual number processed, valid for any limit

            for i, info in enumerate(selected, start=1):
                inspection_id = info['inspection_id']
                logger.info(f"\nProcessing inspection {i}/{total}: {inspection_id}")

                inspection_record = {
                    'index': i,
                    'inspection_id': inspection_id,
                    'facility_name': info.get('link_text', ''),
                    'inspection_url': info['inspection_url'],
                    # Record-level timestamp; main() exports this column.
                    'scraped_at': datetime.now().isoformat()
                }

                pdf_path = self.download_inspection_pdf(inspection_id)
                if pdf_path:
                    inspection_record['pdf_downloaded'] = True
                    inspection_record['pdf_path'] = pdf_path

                    # Merge the fields parsed from the PDF into the record
                    pdf_data = self.extract_pdf_data(pdf_path)
                    inspection_record.update(pdf_data)
                else:
                    inspection_record['pdf_downloaded'] = False

                # Also capture the HTML page of the inspection
                inspection_record['html_details'] = self.scrape_inspection_details(inspection_id)

                all_inspections.append(inspection_record)

                # Small delay between requests (not needed after the last one)
                if i < total:
                    time.sleep(1)

        except Exception as e:
            # logger.exception logs the message together with the traceback.
            logger.exception(f"Error during scraping: {e}")

        return all_inspections
    
    def close(self):
        """Close the browser"""
        if self.driver:
            self.driver.quit()
            logger.info("Browser closed")

def _print_inspection(insp):
    """Print the console summary of a single inspection record."""
    print(f"\n[Inspection #{insp.get('index', 'N/A')}]")
    print(f"ID: {insp.get('inspection_id', 'N/A')}")
    print(f"Facility: {insp.get('facility_name', 'Unknown')}")

    if insp.get('pdf_downloaded'):
        print("PDF: ✓ Downloaded")

        # Fields parsed from the PDF; only non-empty ones are shown
        fields_to_show = ['facility_name', 'address', 'inspection_date', 'score',
                          'grade', 'inspection_type', 'violation_count']
        for field in fields_to_show:
            if insp.get(field):
                print(f"  {field.replace('_', ' ').title()}: {insp[field]}")

        violations = insp.get('violations')
        if violations:
            print(f"  Violations Found: {len(violations)}")
            for v in violations[:3]:  # Show first 3
                desc = v.get('description', '')[:100]
                if v.get('code'):
                    print(f"    - [{v['code']}] {desc}")
                else:
                    print(f"    - {desc}")
    else:
        print("PDF: ✗ Not downloaded")

    print("-"*40)


def _csv_rows(inspections):
    """Build the flat per-inspection rows written to the CSV summary."""
    rows = []
    for insp in inspections:
        # The timestamp may live on the record itself or only inside the
        # nested HTML details; accept either so the column is populated.
        scraped_at = insp.get('scraped_at') or (insp.get('html_details') or {}).get('scraped_at', '')
        rows.append({
            'inspection_id': insp.get('inspection_id', ''),
            'facility_name': insp.get('facility_name', ''),
            'address': insp.get('address', ''),
            'inspection_date': insp.get('inspection_date', ''),
            'score': insp.get('score', ''),
            'grade': insp.get('grade', ''),
            'inspection_type': insp.get('inspection_type', ''),
            'violations_count': insp.get('violation_count', 0),
            'pdf_downloaded': insp.get('pdf_downloaded', False),
            'scraped_at': scraped_at
        })
    return rows


def _print_created_files(pdf_dir, files_to_check):
    """Print the PDFs present in ``pdf_dir`` and the existing output files."""
    print("\n" + "="*60)
    print("FILES CREATED:")
    print("="*60)

    if os.path.isdir(pdf_dir):
        # Only count real PDFs: the same directory also receives
        # debug_*.html dumps from failed downloads.
        pdf_files = sorted(name for name in os.listdir(pdf_dir) if name.lower().endswith('.pdf'))
        print(f"PDFs in sf_pdfs/: {len(pdf_files)} files")
        for pdf in pdf_files[:5]:  # Show first 5
            size = os.path.getsize(os.path.join(pdf_dir, pdf))
            print(f"  - {pdf} ({size:,} bytes)")

    for filepath in files_to_check:
        if os.path.exists(filepath):
            size = os.path.getsize(filepath)
            print(f"  - {filepath} ({size:,} bytes)")


def main():
    """Run the scraper end to end and report the results.

    Scrapes up to five recent inspections, writes them to a JSON file
    (without the bulky ``full_text`` field) and a CSV summary, prints a
    console report, and finally closes the browser and lists the files that
    exist. Errors during the run are logged with their traceback rather than
    propagated; a failure to construct the scraper does propagate.
    """
    logger.info("="*60)
    logger.info("Starting targeted San Francisco inspection scraper")
    logger.info("="*60)

    output_file = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_inspection_results.json"
    csv_file = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_inspection_summary.csv"

    scraper = SFInspectionScraper(headless=True)

    try:
        inspections = scraper.scrape_recent_inspections(limit=5)

        # Drop full_text from the JSON to keep the file size manageable.
        # The payload is built before the file is opened so a failure here
        # cannot leave a truncated, empty output file behind.
        json_data = [
            {k: v for k, v in insp.items() if k != 'full_text'}
            for insp in inspections
        ]
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=2, default=str)

        logger.info(f"Results saved to {output_file}")

        print("\n" + "="*60)
        print("SCRAPING SUMMARY")
        print("="*60)
        print(f"Total inspections processed: {len(inspections)}")
        print(f"PDFs downloaded: {sum(1 for i in inspections if i.get('pdf_downloaded'))}")

        print("\n" + "-"*60)
        print("INSPECTION DETAILS:")
        print("-"*60)

        for insp in inspections:
            _print_inspection(insp)

        if inspections:
            df = pd.DataFrame(_csv_rows(inspections))
            df.to_csv(csv_file, index=False)
            logger.info(f"CSV summary saved to {csv_file}")

            print("\nCSV Summary Table:")
            print(df.to_string())

    except Exception as e:
        # logger.exception logs the message together with the traceback.
        logger.exception(f"Error in main: {e}")

    finally:
        scraper.close()
        _print_created_files(
            scraper.pdf_dir,
            [output_file, csv_file, "/tmp/sf_main_page.html"],
        )

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()