"""List the volume pages of the Wikisource index that are not yet saved locally.

The script fetches the index page referenced by ``url``, collects every link
whose last path segment looks like ``卷X之Y``, maps it to the local file name
produced by ``get_filename`` and writes the URLs whose file is absent from the
local HTML directory to ``missing_urls.txt``.
"""

import os
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"

# Single Chinese digits, including both common spellings of zero.
_CN_DIGITS = {
    "零": 0,
    "〇": 0,
    "一": 1,
    "二": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
}

# Multiplier characters; a multiplier with no preceding digit counts as 1x.
_CN_UNITS = {
    "十": 10,
    "百": 100,
    "千": 1000,
}

# "卷<volume>之<part>" and the plain "卷<volume>" form.
_VOLUME_PART_RE = re.compile(r"卷(.+?)之(.+)")
_VOLUME_RE = re.compile(r"卷(.+)")


def chinese_to_arabic(cn_str):
    """Convert a numeral string (below ten thousand) to an ``int``.

    Three notations are understood:

    * decimal digits, e.g. ``"001"`` -> 1 (previously these yielded 0);
    * positional Chinese digits, e.g. ``"一〇五"`` -> 105 (previously only
      the last digit survived);
    * Chinese numerals with multipliers, e.g. ``"一百二十三"`` -> 123,
      ``"十一"`` -> 11.

    Characters that are neither digits nor multipliers count as 0, and an
    empty string yields 0.

    Args:
        cn_str: The numeral text to convert.

    Returns:
        The integer value of ``cn_str``.
    """
    # Decimal digit strings are already numbers; int() accepts every
    # character for which str.isdecimal() is true.
    if cn_str.isdecimal():
        return int(cn_str)

    # Two or more bare digits with no multiplier are positional notation.
    if len(cn_str) > 1 and all(ch in _CN_DIGITS for ch in cn_str):
        value = 0
        for ch in cn_str:
            value = value * 10 + _CN_DIGITS[ch]
        return value

    result = 0
    pending = 0  # digit waiting for its multiplier
    for char in cn_str:
        unit = _CN_UNITS.get(char)
        if unit is not None:
            # A bare multiplier ("十", "百", "千") means one of that unit.
            result += (pending or 1) * unit
            pending = 0
        else:
            pending = _CN_DIGITS.get(char, 0)
    return result + pending


def get_filename(vol_str):
    """Return the local HTML file name for a volume title.

    ``卷X之Y`` becomes ``卷{X:03d}之{Y}.html`` and ``卷X`` becomes
    ``卷{X:03d}.html``, with the numbers converted by ``chinese_to_arabic``.
    Any other title is returned with ``.html`` appended.

    Args:
        vol_str: Last path segment of a volume link, e.g. ``"卷一之二"``.

    Returns:
        The file name under which that volume is expected on disk.
    """
    m = _VOLUME_PART_RE.match(vol_str)
    if m:
        v1 = chinese_to_arabic(m.group(1))
        v2 = chinese_to_arabic(m.group(2))
        return f"卷{v1:03d}之{v2}.html"

    m = _VOLUME_RE.match(vol_str)
    if m:
        v1 = chinese_to_arabic(m.group(1))
        return f"卷{v1:03d}.html"

    return vol_str + ".html"


def _collect_missing_urls(soup, existing_files):
    """Return the de-duplicated volume URLs whose file is not in ``existing_files``."""
    base_url = "https://zh.wikisource.org"
    missing_urls = []
    seen_urls = set()

    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue

        unquoted_href = urllib.parse.unquote(href)
        if not (
            "御定佩文韻府" in unquoted_href
            and "四庫全書本" in unquoted_href
            and "/卷" in unquoted_href
        ):
            continue

        title_part = unquoted_href.split("/")[-1]
        # Skip the combined "full view" pages.
        if "全覽" in title_part or "全览" in title_part:
            continue
        # Only `卷X之Y` links are of interest.
        if not _VOLUME_PART_RE.match(title_part):
            continue

        # Resolve against the wiki host and drop any fragment so that
        # anchors into the same page are not counted twice.
        full_url = urllib.parse.urljoin(base_url, href).split("#")[0]
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)

        if get_filename(title_part) not in existing_files:
            missing_urls.append(full_url)

    return missing_urls


def main():
    """Fetch the index page and write the missing volume URLs to ``missing_urls.txt``.

    If the index page cannot be fetched, the error is printed and the
    function returns without touching the output file.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Best effort: report and stop instead of continuing without a page.
        print("Failed to fetch:", e)
        return

    soup = BeautifulSoup(html, "html.parser")

    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.isdir(html_dir) else set()

    missing_urls = _collect_missing_urls(soup, existing_files)

    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        f.writelines(u + "\n" for u in missing_urls)

    print(f"Found {len(missing_urls)} missing URLs.")


if __name__ == "__main__":
    main()