#!/home/chris/cleankitchens-env/bin/python3
"""
Process San Francisco PDF inspection reports into database
Extracts all fields and determines inspection status
"""

import os
import sys
import json
import PyPDF2
import re
import mysql.connector
from datetime import datetime
import hashlib

# Database configuration.
# Every setting can be overridden through an environment variable so that
# credentials do not have to be edited into the source file. When a variable
# is not set, the original local-development value is used.
DB_CONFIG = {
    'host': os.environ.get('CLEANKITCHENS_DB_HOST', 'localhost'),
    'database': os.environ.get('CLEANKITCHENS_DB_NAME', 'cleankitchens'),
    'user': os.environ.get('CLEANKITCHENS_DB_USER', 'root'),
    'password': os.environ.get('CLEANKITCHENS_DB_PASSWORD', ''),
}

class SFInspectionProcessor:
    def __init__(self):
        """Connect to the database and make sure the sf_temp table exists.

        ``conn`` and ``cursor`` start out as None and are expected to be
        assigned by ``connect_db``; ``create_table`` then runs on that cursor.
        """
        # Placeholders so both attributes exist before connect_db() runs.
        self.conn = self.cursor = None
        self.connect_db()
        self.create_table()
    
    def connect_db(self):
        """Open a MySQL connection from DB_CONFIG and create a cursor on it.

        On success ``self.conn`` and ``self.cursor`` are populated. Any
        failure is reported on stdout and terminates the process with exit
        status 1.
        """
        try:
            connection = mysql.connector.connect(**DB_CONFIG)
            self.conn = connection
            self.cursor = connection.cursor()
            print("✓ Connected to database")
        except Exception as exc:
            # Nothing useful can be done without a database: report and stop.
            print(f"❌ Database connection failed: {exc}")
            sys.exit(1)
    
    def create_table(self):
        """Create the sf_temp table if it does not already exist.

        Executes a single ``CREATE TABLE IF NOT EXISTS`` statement on
        ``self.cursor`` and commits it on ``self.conn``. A failure is printed
        to stdout and swallowed: this method neither raises nor exits.
        """
        # inspection_id is declared UNIQUE; process_inspection's
        # "ON DUPLICATE KEY UPDATE" upsert relies on that constraint.
        create_sql = """
        CREATE TABLE IF NOT EXISTS sf_temp (
            id INT AUTO_INCREMENT PRIMARY KEY,
            inspection_id VARCHAR(50) UNIQUE NOT NULL,
            facility_name VARCHAR(255),
            address VARCHAR(500),
            inspection_date DATE,
            inspection_time TIME,
            inspector_name VARCHAR(255),
            inspector_email VARCHAR(255),
            inspector_phone VARCHAR(50),
            permit_expiration DATE,
            owner_name VARCHAR(255),
            certified_manager VARCHAR(255),
            manager_cert_expiration DATE,
            inspection_type VARCHAR(100),
            inspection_status VARCHAR(50),  -- PASS, CONDITIONAL PASS, CLOSURE
            violation_count INT DEFAULT 0,
            violations JSON,
            corrective_actions TEXT,
            observations TEXT,
            score INT,
            current_violations INT,
            total_violations INT,
            pdf_text LONGTEXT,
            pdf_filename VARCHAR(255),
            pdf_url VARCHAR(500),
            page_text TEXT,
            collected_at TIMESTAMP,
            processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            data_hash VARCHAR(64),
            raw_data JSON,
            
            INDEX idx_inspection_date (inspection_date),
            INDEX idx_facility_name (facility_name),
            INDEX idx_inspection_status (inspection_status),
            INDEX idx_violation_count (violation_count)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
        """
        
        try:
            self.cursor.execute(create_sql)
            self.conn.commit()
            print("✓ Table sf_temp ready")
        except Exception as e:
            # Reported only; the caller continues even if the table is missing.
            print(f"❌ Table creation error: {e}")
    
    def extract_pdf_text(self, pdf_path):
        """Extract the text of every page of a PDF, concatenated in page order.

        Args:
            pdf_path: Filesystem path of the PDF file to read.

        Returns:
            The page texts joined together with no separator between pages,
            or None when the file cannot be opened or parsed (the error is
            printed to stdout).
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Treat a page that yields no text as empty so that a single
                # blank/image-only page cannot make the whole document fail.
                return "".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            # Deliberately broad: any unreadable or malformed PDF is reported
            # and skipped rather than aborting a whole batch run.
            print(f"Error reading PDF {pdf_path}: {e}")
            return None
    
    def parse_inspection_data(self, pdf_text, json_data=None):
        """Parse structured inspection fields out of extracted PDF text.

        Each field is located with a regular-expression heuristic. A field
        whose pattern does not match is left out of the result, except for
        the keys listed below, which are always present.

        Args:
            pdf_text: Text extracted from an inspection report PDF.
            json_data: Accepted for interface compatibility; the parser does
                not read it.

        Returns:
            dict that always contains:
              - ``inspection_status``: 'PASS', 'CONDITIONAL PASS', 'CLOSURE'
                or 'UNKNOWN'
              - ``violations``: list of ``{'code', 'description'}`` dicts
              - ``violation_count``: ``len(violations)``
              - ``corrective_actions`` / ``observations``: matches joined
                with ' | ' (empty string when there are none)
            and, when found in the text: ``inspection_date`` (date),
            ``inspection_time`` (time), ``facility_name``, ``address``,
            ``inspector_name``, ``inspector_email``, ``inspector_phone``,
            ``owner_name``, ``certified_manager``, ``inspection_type``,
            ``current_violations``, ``total_violations`` and ``score``.
        """
        data = {}

        # Inspection Date
        if match := re.search(r'Insp(?:ection)?\s+Date\s*(\d{2}/\d{2}/\d{4})', pdf_text, re.IGNORECASE):
            try:
                data['inspection_date'] = datetime.strptime(match.group(1), '%m/%d/%Y').date()
            except ValueError:
                # The digits matched but do not form a real calendar date.
                pass

        # Inspection Time. The clock part and the AM/PM marker are captured
        # separately and re-joined with exactly one space: the pattern allows
        # "10:30AM", but strptime's '%I:%M %p' needs whitespace before %p, so
        # passing the raw match would silently drop such times.
        if match := re.search(r'(?:Time In|Inspection Time)\s*(\d{1,2}:\d{2})\s*([AP]M)', pdf_text, re.IGNORECASE):
            try:
                data['inspection_time'] = datetime.strptime(
                    f"{match.group(1)} {match.group(2)}", '%I:%M %p'
                ).time()
            except ValueError:
                # e.g. an hour outside 1-12.
                pass

        # Facility Name
        if match := re.search(r'Permit Name\s*([^\n]+)', pdf_text, re.IGNORECASE):
            data['facility_name'] = match.group(1).strip()

        # Address
        if match := re.search(r'Address\s*(\d+[^,\n]+(?:,\s*[^,\n]+)*,?\s*SAN FRANCISCO[^,\n]*CA\s*\d{5})', pdf_text, re.IGNORECASE):
            data['address'] = match.group(1).strip()

        # Inspector Info
        if match := re.search(r'Inspector\s+([A-Za-z\s]+?)(?:Inspector Email|Inspector Phone|\n)', pdf_text):
            data['inspector_name'] = match.group(1).strip()
        if match := re.search(r'Inspector Email\s*([\w\.\-]+@[\w\.\-]+)', pdf_text, re.IGNORECASE):
            data['inspector_email'] = match.group(1).strip()
        if match := re.search(r'Inspector Phone\s*\(?([\d\-\s\(\)]+)\)?', pdf_text, re.IGNORECASE):
            data['inspector_phone'] = match.group(1).strip()

        # Owner
        if match := re.search(r'Owner\s*([^\n]+?)(?:Owner|PIC Email|Phone|Certified|\n)', pdf_text):
            data['owner_name'] = match.group(1).strip()

        # Certified Manager
        if match := re.search(r'Certified Food Manager\s*([^\n]+?)(?:Certification|Owner|\n)', pdf_text):
            data['certified_manager'] = match.group(1).strip()

        # Inspection Type
        if match := re.search(r'Purpose of Inspection\s*([^\n]+)', pdf_text, re.IGNORECASE):
            data['inspection_type'] = match.group(1).strip()

        # CRITICAL: Determine Inspection Status from PDF.
        # "PASS" only counts when it stands as its own word (or is glued
        # directly onto a following CONDITIONAL/CLOSURE label). A plain
        # substring test would let words such as "COMPASS" or "BYPASS"
        # masquerade as a PASS marker and suppress a genuine CLOSURE.
        standalone_pass = re.search(r'\bPASS(?:\b|(?=CONDITIONAL|CLOSURE))', pdf_text)
        if 'CONDITIONAL PASS' in pdf_text or 'CONDITIONAL\nPASS' in pdf_text:
            status = 'CONDITIONAL PASS'
        elif 'CLOSURE' in pdf_text and not standalone_pass:
            status = 'CLOSURE'
        elif (
            standalone_pass
            and 'CONDITIONAL' not in pdf_text
            # Check context to ensure it's the status, not just running text.
            and re.search(r'\bPASS\s*(?:$|\n|CONDITIONAL|CLOSURE)', pdf_text)
        ):
            status = 'PASS'
        else:
            status = 'UNKNOWN'
        data['inspection_status'] = status

        # Extract Current/Total violations if shown
        if match := re.search(r'Curre(?:nt)?\s*(\d+)\s*Total\s*(\d+)', pdf_text):
            data['current_violations'] = int(match.group(1))
            data['total_violations'] = int(match.group(2))

        # Extract violations: "<1-2 digit code> - <UPPERCASE DESCRIPTION>".
        violation_pattern = r'(\d{1,2})\s*-\s*([A-Z][A-Z\s,:;&]+?)(?:Corrective Action:|Observation:|California Retail Food Code:|\d{1,2}\s*-|$)'
        violations = []
        for code, description in re.findall(violation_pattern, pdf_text, re.MULTILINE | re.DOTALL):
            desc_clean = re.sub(r'\s+', ' ', description).strip()
            if len(desc_clean) > 10:  # Filter out too short matches
                violations.append({
                    'code': code,
                    'description': desc_clean[:200],  # Limit length
                })

        data['violations'] = violations
        data['violation_count'] = len(violations)

        # Extract Corrective Actions (whitespace collapsed, blanks dropped)
        actions = (
            re.sub(r'\s+', ' ', raw).strip()
            for raw in re.findall(
                r'Corrective Action:\s*([^:]+?)(?:Observation:|California Retail Food Code:|$)',
                pdf_text,
                re.DOTALL,
            )
        )
        data['corrective_actions'] = ' | '.join(text for text in actions if text)

        # Extract Observations (same clean-up as corrective actions)
        notes = (
            re.sub(r'\s+', ' ', raw).strip()
            for raw in re.findall(
                r'Observation:\s*([^:]+?)(?:Corrective Action:|California Retail Food Code:|$)',
                pdf_text,
                re.DOTALL,
            )
        )
        data['observations'] = ' | '.join(text for text in notes if text)

        # Score (if present)
        if match := re.search(r'Score[:\s]+(\d+)', pdf_text, re.IGNORECASE):
            data['score'] = int(match.group(1))

        return data
    
    def process_inspection(self, pdf_path, json_path=None):
        """Parse one inspection PDF and upsert the result into sf_temp.

        The inspection id is the PDF file name without its extension. If a
        row with that id already exists, every column derived from the PDF is
        refreshed so the stored row stays consistent with its ``data_hash``;
        ``collected_at`` and ``raw_data`` are left untouched on update so a
        re-run without the JSON sidecar does not erase collection metadata.

        Args:
            pdf_path: Path of the PDF to process.
            json_path: Optional path of a JSON sidecar file. Its content is
                stored in ``raw_data`` and, when it is an object with a
                ``collected_at`` key, supplies the collection timestamp.

        Returns:
            True when the row was written and committed; False when no text
            could be extracted from the PDF or the database operation failed.
        """
        inspection_id = os.path.splitext(os.path.basename(pdf_path))[0]

        # Extract PDF text
        pdf_text = self.extract_pdf_text(pdf_path)
        if not pdf_text:
            return False

        # Load JSON data if available. The sidecar is optional, so a missing
        # or corrupt file is reported and ignored instead of failing the PDF.
        json_data = {}
        if json_path and os.path.exists(json_path):
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    json_data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and decoding errors.
                print(f"⚠️  Ignoring unreadable JSON {json_path}: {e}")
                json_data = {}

        # Only a JSON object can carry metadata keys; any other top-level
        # value (list, string, ...) is still stored as raw_data below.
        metadata = json_data if isinstance(json_data, dict) else {}

        # Parse inspection data
        parsed_data = self.parse_inspection_data(pdf_text, json_data)

        # Add metadata
        parsed_data['inspection_id'] = inspection_id
        parsed_data['pdf_text'] = pdf_text
        parsed_data['pdf_filename'] = os.path.basename(pdf_path)
        parsed_data['collected_at'] = metadata.get('collected_at', datetime.now().isoformat())

        # Create data hash for duplicate detection
        hash_content = f"{inspection_id}_{parsed_data.get('inspection_date', '')}_{len(pdf_text)}"
        parsed_data['data_hash'] = hashlib.md5(hash_content.encode()).hexdigest()

        # Insert or update in database
        sql = """
        INSERT INTO sf_temp (
            inspection_id, facility_name, address, inspection_date, inspection_time,
            inspector_name, inspector_email, inspector_phone, owner_name,
            certified_manager, inspection_type, inspection_status,
            violation_count, violations, corrective_actions, observations,
            current_violations, total_violations, score,
            pdf_text, pdf_filename, collected_at, data_hash, raw_data
        ) VALUES (
            %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
        ) ON DUPLICATE KEY UPDATE
            facility_name = VALUES(facility_name),
            address = VALUES(address),
            inspection_date = VALUES(inspection_date),
            inspection_time = VALUES(inspection_time),
            inspector_name = VALUES(inspector_name),
            inspector_email = VALUES(inspector_email),
            inspector_phone = VALUES(inspector_phone),
            owner_name = VALUES(owner_name),
            certified_manager = VALUES(certified_manager),
            inspection_type = VALUES(inspection_type),
            inspection_status = VALUES(inspection_status),
            violation_count = VALUES(violation_count),
            violations = VALUES(violations),
            corrective_actions = VALUES(corrective_actions),
            observations = VALUES(observations),
            current_violations = VALUES(current_violations),
            total_violations = VALUES(total_violations),
            score = VALUES(score),
            pdf_text = VALUES(pdf_text),
            pdf_filename = VALUES(pdf_filename),
            data_hash = VALUES(data_hash),
            processed_at = NOW()
        """

        params = (
            parsed_data.get('inspection_id'),
            parsed_data.get('facility_name'),
            parsed_data.get('address'),
            parsed_data.get('inspection_date'),
            parsed_data.get('inspection_time'),
            parsed_data.get('inspector_name'),
            parsed_data.get('inspector_email'),
            parsed_data.get('inspector_phone'),
            parsed_data.get('owner_name'),
            parsed_data.get('certified_manager'),
            parsed_data.get('inspection_type'),
            parsed_data.get('inspection_status'),
            parsed_data.get('violation_count', 0),
            json.dumps(parsed_data.get('violations', [])),
            parsed_data.get('corrective_actions'),
            parsed_data.get('observations'),
            parsed_data.get('current_violations'),
            parsed_data.get('total_violations'),
            parsed_data.get('score'),
            pdf_text[:65000],  # Limit text size
            parsed_data.get('pdf_filename'),
            parsed_data.get('collected_at'),
            parsed_data.get('data_hash'),
            json.dumps(json_data),
        )

        # Keep the try body to the database calls so that only database
        # failures are reported as database errors.
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            print(f"❌ Database error for {inspection_id}: {e}")
            return False

        status = parsed_data['inspection_status']
        status_icon = {
            'CLOSURE': "🔴",
            'CONDITIONAL PASS': "🟡",
            'PASS': "🟢",
        }.get(status, "⚪")
        facility = parsed_data.get('facility_name', 'Unknown')[:30]
        print(f"{status_icon} {inspection_id[:8]}... - {facility} - Status: {status} - Violations: {parsed_data['violation_count']}")
        return True
    
    def process_directory(self, pdf_dir, json_dir=None):
        """Process all PDFs in directory"""
        pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
        
        print(f"\nProcessing {len(pdf_files)} PDFs...")
        print("="*50)
        
        processed = 0
        failed = 0
        
        for pdf_file in pdf_files:
            pdf_path = os.path.join(pdf_dir, pdf_file)
            json_path = None
            
            if json_dir:
                json_file = pdf_file.replace('.pdf', '.json')
                json_path = os.path.join(json_dir, json_file)
            
            if self.process_inspection(pdf_path, json_path):
                processed += 1
            else:
                failed += 1
        
        print("="*50)
        print(f"✓ Processed: {processed}")
        print(f"✗ Failed: {failed}")
        
        # Show summary
        self.show_summary()
    
    def show_summary(self):
        """Show database summary"""
        sql = """
        SELECT 
            COUNT(*) as total,
            SUM(CASE WHEN inspection_status = 'PASS' THEN 1 ELSE 0 END) as pass_count,
            SUM(CASE WHEN inspection_status = 'CONDITIONAL PASS' THEN 1 ELSE 0 END) as conditional_count,
            SUM(CASE WHEN inspection_status = 'CLOSURE' THEN 1 ELSE 0 END) as closure_count,
            AVG(violation_count) as avg_violations,
            MAX(violation_count) as max_violations
        FROM sf_temp
        """
        
        self.cursor.execute(sql)
        result = self.cursor.fetchone()
        
        print("\n📊 Database Summary:")
        print(f"  Total inspections: {result[0]}")
        print(f"  🟢 Pass: {result[1]}")
        print(f"  🟡 Conditional Pass: {result[2]}")
        print(f"  🔴 Closures: {result[3]}")
        print(f"  📋 Avg violations: {result[4]:.1f}")
        print(f"  ⚠️  Max violations: {result[5]}")

if __name__ == "__main__":
    # Default data locations. Either may be overridden on the command line:
    #   <script> [PDF_DIR [JSON_DIR]]
    pdf_dir = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf/inspection_data/pdfs"
    json_dir = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf/inspection_data/json"
    if len(sys.argv) > 1:
        pdf_dir = sys.argv[1]
    if len(sys.argv) > 2:
        json_dir = sys.argv[2]

    # Process the uploaded PDFs
    processor = SFInspectionProcessor()

    # isdir rather than exists: a regular file at this path would otherwise
    # pass the check and then fail inside os.listdir().
    if os.path.isdir(pdf_dir):
        processor.process_directory(pdf_dir, json_dir)
    else:
        print(f"Directory not found: {pdf_dir}")