Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/佩文韵府/download_sequentially.py
+++ b/佩文韵府/download_sequentially.py
@@ -0,0 +1,71 @@
+import os
+import time
+import urllib.request
+import urllib.parse
+from urllib.error import HTTPError
+
+def main():
+    if not os.path.exists("html_files"):
+        os.makedirs("html_files")
+
+    with open("still_missing.txt", "r", encoding="utf-8") as f:
+        urls = [line.strip() for line in f if line.strip()]
+
+    proxy_support = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:10808', 'https': 'http://127.0.0.1:10808'})
+    opener = urllib.request.build_opener(proxy_support)
+    urllib.request.install_opener(opener)
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+        'Referer': 'https://zh.wikisource.org/'
+    }
+
+    count = 0
+    total = len(urls)
+    
+    for url in urls:
+        vol = urllib.parse.unquote(url).split('/')[-1]
+        filename = f'html_files/{vol}.html'
+        
+        if os.path.exists(filename):
+            print(f"[{count+1}/{total}] Skipping {vol} (exists)")
+            count += 1
+            continue
+            
+        print(f"[{count+1}/{total}] Downloading {vol}...")
+        
+        success = False
+        for attempt in range(5):
+            try:
+                req = urllib.request.Request(url, headers=headers)
+                response = urllib.request.urlopen(req, timeout=15)
+                html = response.read().decode('utf-8')
+                
+                with open(filename, 'w', encoding='utf-8') as out_f:
+                    out_f.write(html)
+                    
+                print(f"  -> Success!")
+                success = True
+                break
+                
+            except HTTPError as e:
+                if e.code == 403:
+                    print(f"  -> 403 Forbidden. Waiting 5 seconds...")
+                    time.sleep(5)
+                else:
+                    print(f"  -> HTTP Error {e.code}")
+            except Exception as e:
+                print(f"  -> Error: {e}")
+                
+            time.sleep(2)
+            
+        if not success:
+            print(f"  -> Failed all attempts.")
+            
+        count += 1
+        time.sleep(1) # Be nice to server
+
+# Script entry point: run the downloader only when this file is executed
+# directly, not when it is imported as a module.
+if __name__ == "__main__":
+    main()