#!/usr/bin/env python3
"""
Test accessing SF inspection site through various proxy methods
"""

import requests
import json
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import subprocess
import time

# Configure root logging once at import time: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-scoped logger shared by every test function in this file.
logger = logging.getLogger(__name__)

def test_with_tor():
    """Try to fetch the SF inspection page through a local Tor SOCKS proxy.

    Installs Tor with ``apt`` when no ``tor`` executable is found on ``PATH``,
    starts the ``tor`` service, logs the exit IP reported by api.ipify.org
    (best-effort) and then requests the inspection site through
    ``socks5h://127.0.0.1:9050``. On HTTP 200 the body is saved to
    ``/tmp/sf_tor_response.html``.

    Returns:
        bool: True if the target site answered with HTTP 200; False for any
        other status, a failed Tor installation, or any error. Errors are
        logged and never propagate to the caller.
    """
    import shutil  # local import: only this function needs it

    logger.info("Testing with Tor...")

    try:
        # shutil.which avoids depending on an external `which` binary, whose
        # absence would otherwise abort the whole test with FileNotFoundError.
        if shutil.which("tor") is None:
            logger.info("Installing Tor...")
            install = subprocess.run(
                ["sudo", "apt", "install", "-y", "tor"],
                capture_output=True,
                text=True,
            )
            if install.returncode != 0:
                # Without a tor binary nothing below can succeed; fail fast
                # instead of waiting on connection errors.
                logger.error(f"Tor installation failed: {install.stderr.strip()}")
                return False
            time.sleep(2)

        # Start tor service
        logger.info("Starting Tor service...")
        start = subprocess.run(
            ["sudo", "service", "tor", "start"],
            capture_output=True,
            text=True,
        )
        if start.returncode != 0:
            # Only warn: a Tor instance started some other way may still be
            # listening on the SOCKS port.
            logger.warning(f"Starting Tor service failed: {start.stderr.strip()}")
        # Fixed pause before the first request (presumably to let Tor finish
        # bootstrapping).
        time.sleep(5)

        # socks5h (not socks5) makes the proxy resolve hostnames as well.
        proxies = {
            'http': 'socks5h://127.0.0.1:9050',
            'https': 'socks5h://127.0.0.1:9050'
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        # The IP lookup is purely informational, so a failure here must not
        # decide the outcome of the actual test below.
        logger.info("Checking IP address through Tor...")
        try:
            ip_check = requests.get('https://api.ipify.org?format=json', proxies=proxies, timeout=30)
            logger.info(f"Current IP through Tor: {ip_check.json()}")
        except (requests.RequestException, ValueError) as e:
            logger.warning(f"Could not determine IP through Tor: {e}")

        # Now try the target site
        logger.info("Attempting to access SF site through Tor...")
        response = requests.get(
            'https://inspections.myhealthdepartment.com/san-francisco',
            headers=headers,
            proxies=proxies,
            timeout=30
        )

        logger.info(f"Response status: {response.status_code}")

        if response.status_code != 200:
            logger.warning(f"❌ Still blocked with Tor: {response.status_code}")
            return False

        logger.info("✅ SUCCESS with Tor!")
        # Explicit UTF-8 so the dump cannot fail on a non-UTF-8 locale.
        with open("/tmp/sf_tor_response.html", "w", encoding="utf-8") as f:
            f.write(response.text)
        return True

    except Exception as e:
        # Deliberate catch-all: this is a best-effort probe and the caller
        # only wants a True/False verdict.
        logger.error(f"Tor test failed: {e}")
        return False

def test_with_free_proxy():
    """Try to fetch the SF inspection page through public HTTP proxies.

    Downloads a proxy list from proxy-list.download, takes the first five
    non-empty entries and requests the inspection site through each in turn.
    The first proxy that yields HTTP 200 wins and the body is saved to
    ``/tmp/sf_proxy_response.html``. Free proxies are often unreliable, so
    individual failures are logged and skipped.

    Returns:
        bool: True as soon as one proxy returns HTTP 200, otherwise False
        (including when the proxy list cannot be fetched or is empty).
    """
    logger.info("Testing with free proxies...")

    target_url = 'https://inspections.myhealthdepartment.com/san-francisco'
    proxy_list_url = "https://www.proxy-list.download/api/v1/get?type=https"

    try:
        logger.info("Fetching free proxy list...")
        listing = requests.get(proxy_list_url, timeout=10)
    except Exception as e:
        logger.error(f"Free proxy test failed: {e}")
        return False

    if listing.status_code != 200:
        logger.warning(f"Proxy list request returned {listing.status_code}")
        return False

    # splitlines() handles both "\n" and "\r\n" separators; blank entries are
    # dropped so an empty response does not produce a bogus '' proxy.
    candidates = (line.strip() for line in listing.text.splitlines())
    proxy_ips = [entry for entry in candidates if entry][:5]  # first 5 only
    logger.info(f"Found {len(proxy_ips)} free proxies to test")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    for proxy_ip in proxy_ips:
        # Both schemes go through the same plain-HTTP proxy endpoint.
        proxies = {
            'http': f'http://{proxy_ip}',
            'https': f'http://{proxy_ip}'
        }

        logger.info(f"Testing proxy: {proxy_ip}")
        try:
            response = requests.get(
                target_url,
                headers=headers,
                proxies=proxies,
                timeout=10
            )

            if response.status_code == 200:
                logger.info(f"✅ SUCCESS with proxy {proxy_ip}!")
                # Explicit UTF-8 so the dump cannot fail on a non-UTF-8 locale.
                with open("/tmp/sf_proxy_response.html", "w", encoding="utf-8") as f:
                    f.write(response.text)
                return True

            logger.warning(f"Proxy {proxy_ip} returned {response.status_code}")

        except Exception as e:
            # Deliberate catch-all so one dead proxy never stops the scan.
            # Logged at WARNING because the script runs at INFO level and a
            # DEBUG record would be dropped silently.
            logger.warning(f"Proxy {proxy_ip} failed: {e}")

    return False

def test_with_selenium_proxy():
    """Try to load the SF inspection page in headless Chrome via a SOCKS proxy.

    Chrome is pointed at ``socks5://127.0.0.1:9050`` (the same local port the
    Tor test uses). The function logs the IP shown by api.ipify.org, loads the
    inspection site and treats the attempt as successful when the page title
    does not contain ``"403"``. On success a screenshot and the page source
    are saved to ``/tmp/sf_selenium_proxy.png`` and
    ``/tmp/sf_selenium_proxy.html``.

    Returns:
        bool: True if the title check passes, False if it fails or any error
        occurs. Errors are logged and never propagate to the caller.
    """
    logger.info("Testing Selenium with proxy configuration...")

    driver = None
    try:
        chrome_options = Options()

        # Try using a SOCKS proxy through Tor
        chrome_options.add_argument('--proxy-server=socks5://127.0.0.1:9050')

        # Other stealth options
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_argument('--headless=new')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(options=chrome_options)

        # Check IP
        driver.get('https://api.ipify.org')
        ip_text = driver.find_element("tag name", "body").text
        logger.info(f"Selenium IP through proxy: {ip_text}")

        # Try target site
        driver.get('https://inspections.myhealthdepartment.com/san-francisco')
        time.sleep(3)  # fixed wait after navigation before reading the title

        page_title = driver.title

        # NOTE(review): success is inferred only from the absence of "403" in
        # the title; other error pages would also pass this check.
        if "403" in page_title:
            logger.warning("❌ Still blocked with Selenium proxy")
            return False

        logger.info("✅ SUCCESS with Selenium proxy!")
        driver.save_screenshot("/tmp/sf_selenium_proxy.png")
        # Explicit UTF-8 so the dump cannot fail on a non-UTF-8 locale.
        with open("/tmp/sf_selenium_proxy.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        return True

    except Exception as e:
        # Deliberate catch-all: the caller only wants a True/False verdict.
        logger.error(f"Selenium proxy test failed: {e}")
        return False

    finally:
        # Always shut the browser down, including when a navigation or file
        # write above raised; otherwise the Chrome process is leaked.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                logger.warning(f"Failed to close Selenium driver: {e}")

def test_with_different_headers():
    """Try to fetch the SF inspection page directly with several header sets.

    Requests the inspection site without any proxy, once per header set
    (mobile Safari, Googlebot, an old Firefox). The first set that yields
    HTTP 200 wins and the body is saved to ``/tmp/sf_headers_<n>.html``,
    where ``<n>`` is the 1-based index of the header set.

    Returns:
        bool: True as soon as one header set returns HTTP 200, otherwise
        False. Errors are logged and never propagate to the caller.
    """
    logger.info("Testing with different header combinations...")

    target_url = 'https://inspections.myhealthdepartment.com/san-francisco'

    header_sets = [
        {
            # Mobile browser
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            # NOTE(review): advertising "br" presumably needs a brotli decoder
            # installed for the body to be decoded correctly - confirm.
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
        },
        {
            # Googlebot
            'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        },
        {
            # Old browser
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
        },
    ]

    for i, headers in enumerate(header_sets, 1):
        logger.info(f"Testing header set {i}: {headers.get('User-Agent', '')[:50]}...")
        try:
            response = requests.get(target_url, headers=headers, timeout=10)

            if response.status_code == 200:
                logger.info(f"✅ SUCCESS with header set {i}!")
                # Explicit UTF-8 so the dump cannot fail on a non-UTF-8 locale.
                with open(f"/tmp/sf_headers_{i}.html", "w", encoding="utf-8") as f:
                    f.write(response.text)
                return True

            logger.warning(f"Header set {i} returned {response.status_code}")

        except Exception as e:
            # Deliberate catch-all so one failing attempt never stops the
            # remaining header sets. Logged at WARNING because the script
            # runs at INFO level and a DEBUG record would be dropped silently.
            logger.warning(f"Header set {i} failed: {e}")

    return False

def main():
    """Run every access test, print a summary and stop the Tor service.

    The Tor, header and free-proxy tests always run, in that order. The
    Selenium test runs only when the Tor test succeeded, because it uses the
    same local SOCKS port. Results are printed as a table followed by either
    a success note or a list of remaining options.

    The ``tor`` service is stopped on the way out no matter how the run ends
    (including an interrupt). The stop is unconditional: it also stops a Tor
    service that was already running before this script started.
    """
    logger.info("="*60)
    logger.info("SF INSPECTION SITE - PROXY/IP TESTING")
    logger.info("="*60)

    results = {
        'tor': False,
        'free_proxy': False,
        'selenium_proxy': False,
        'headers': False
    }

    try:
        # Test different methods
        logger.info("\n1. Testing with Tor network...")
        results['tor'] = test_with_tor()

        logger.info("\n2. Testing with different headers...")
        results['headers'] = test_with_different_headers()

        logger.info("\n3. Testing with free proxies...")
        results['free_proxy'] = test_with_free_proxy()

        # Selenium is only worth trying when the Tor proxy is known to work.
        if results['tor']:
            logger.info("\n4. Testing Selenium with Tor proxy...")
            results['selenium_proxy'] = test_with_selenium_proxy()

        # Summary
        print("\n" + "="*60)
        print("PROXY TEST RESULTS")
        print("="*60)

        for method, success in results.items():
            status = "✅ SUCCESS" if success else "❌ FAILED"
            print(f"{method:20} : {status}")

        if any(results.values()):
            print("\n🎉 At least one method worked!")
            print("Check /tmp/ directory for saved HTML files")
        else:
            print("\n😞 All proxy methods failed")
            print("\nRemaining options:")
            print("1. Use a paid proxy service (residential IPs)")
            print("2. Use a cloud browser service")
            print("3. Run the scraper from a different server/VPS")
            print("4. Use your local machine as a proxy server")
    finally:
        # Runs even if the tests were interrupted, so Tor is not left running.
        # OSError covers a missing `sudo` binary, which would otherwise end
        # the script with a traceback after the summary.
        try:
            subprocess.run(["sudo", "service", "tor", "stop"], capture_output=True)
        except OSError as e:
            logger.warning(f"Could not stop Tor service: {e}")

# Run the test sequence only when this file is executed as a script, so that
# importing the module has no side effects beyond logging configuration.
if __name__ == "__main__":
    main()