"""Download a list of Wikisource pages through a local HTTP proxy.

Reads URLs (one per line) from ``still_missing.txt`` and saves each page
as ``html_files/<last-path-segment>.html``, skipping files that already
exist.  Each URL is retried up to 5 times, with a longer back-off on
HTTP 403 responses.
"""

import os
import time
import urllib.parse
import urllib.request
from urllib.error import HTTPError


def _load_urls(path):
    """Return the non-empty, stripped lines of *path* as a list of URLs."""
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def _download_page(url, filename, headers):
    """Fetch *url* (up to 5 attempts) and write its HTML to *filename*.

    Returns True on success, False if every attempt failed.  A 403
    response waits 5 seconds before retrying; other errors wait 2
    seconds (matching the original per-error pacing).
    """
    for _attempt in range(5):
        try:
            req = urllib.request.Request(url, headers=headers)
            # Context manager closes the connection even if decoding or
            # the file write below raises (the original leaked sockets).
            with urllib.request.urlopen(req, timeout=15) as response:
                # Honor the server-declared charset; fall back to UTF-8.
                # errors='replace' keeps one bad byte from killing the run.
                charset = response.headers.get_content_charset() or "utf-8"
                html = response.read().decode(charset, errors="replace")
            with open(filename, "w", encoding="utf-8") as out_f:
                out_f.write(html)
            print(" -> Success!")
            return True
        except HTTPError as e:
            if e.code == 403:
                print(" -> 403 Forbidden. Waiting 5 seconds...")
                time.sleep(5)
            else:
                print(f" -> HTTP Error {e.code}")
        except Exception as e:
            print(f" -> Error: {e}")
            time.sleep(2)
    return False


def main():
    """Entry point: set up proxying, then download every missing page."""
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs("html_files", exist_ok=True)

    urls = _load_urls("still_missing.txt")

    # Route all urllib traffic through the local proxy.
    proxy_support = urllib.request.ProxyHandler(
        {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    )
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    # Browser-like headers: Wikisource rejects urllib's default User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Referer": "https://zh.wikisource.org/",
    }

    total = len(urls)
    for count, url in enumerate(urls, start=1):
        # Last path segment (percent-decoded) names the output file.
        # NOTE(review): assumes the segment contains no filesystem-unsafe
        # characters such as '/' after unquoting — confirm for this dataset.
        vol = urllib.parse.unquote(url).split("/")[-1]
        filename = f"html_files/{vol}.html"

        if os.path.exists(filename):
            print(f"[{count}/{total}] Skipping {vol} (exists)")
            continue

        print(f"[{count}/{total}] Downloading {vol}...")
        if not _download_page(url, filename, headers):
            print(" -> Failed all attempts.")

        time.sleep(1)  # Be nice to server


if __name__ == "__main__":
    main()