"""Download a list of Wikisource pages through a local HTTP proxy.

Reads URLs (one per line) from ``still_missing.txt`` and saves each page
as ``html_files/<last-path-segment>.html``, skipping files that already
exist.  Each URL is retried up to 5 times, with a longer back-off on
HTTP 403 responses.
"""

import os
import time
import urllib.parse
import urllib.request
from urllib.error import HTTPError


def _load_urls(path):
    """Return the non-empty, stripped lines of *path* as a list of URLs."""
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def _download_page(url, filename, headers):
    """Fetch *url* (up to 5 attempts) and write its HTML to *filename*.

    Returns True on success, False if every attempt failed.  A 403
    response waits 5 seconds before retrying; other errors wait 2
    seconds (matching the original per-error pacing).
    """
    for _attempt in range(5):
        try:
            req = urllib.request.Request(url, headers=headers)
            # Context manager closes the connection even if decoding or
            # the file write below raises (the original leaked sockets).
            with urllib.request.urlopen(req, timeout=15) as response:
                # Honor the server-declared charset; fall back to UTF-8.
                # errors='replace' keeps one bad byte from killing the run.
                charset = response.headers.get_content_charset() or "utf-8"
                html = response.read().decode(charset, errors="replace")
            with open(filename, "w", encoding="utf-8") as out_f:
                out_f.write(html)
            print(" -> Success!")
            return True
        except HTTPError as e:
            if e.code == 403:
                print(" -> 403 Forbidden. Waiting 5 seconds...")
                time.sleep(5)
            else:
                print(f" -> HTTP Error {e.code}")
        except Exception as e:
            print(f" -> Error: {e}")
            time.sleep(2)
    return False


def main():
    """Entry point: set up proxying, then download every missing page."""
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs("html_files", exist_ok=True)

    urls = _load_urls("still_missing.txt")

    # Route all urllib traffic through the local proxy.
    proxy_support = urllib.request.ProxyHandler(
        {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    )
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    # Browser-like headers: Wikisource rejects urllib's default User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Referer": "https://zh.wikisource.org/",
    }

    total = len(urls)
    for count, url in enumerate(urls, start=1):
        # Last path segment (percent-decoded) names the output file.
        # NOTE(review): assumes the segment contains no filesystem-unsafe
        # characters such as '/' after unquoting — confirm for this dataset.
        vol = urllib.parse.unquote(url).split("/")[-1]
        filename = f"html_files/{vol}.html"

        if os.path.exists(filename):
            print(f"[{count}/{total}] Skipping {vol} (exists)")
            continue

        print(f"[{count}/{total}] Downloading {vol}...")
        if not _download_page(url, filename, headers):
            print(" -> Failed all attempts.")

        time.sleep(1)  # Be nice to server


if __name__ == "__main__":
    main()