Files
spider-ctext/佩文韵府/extract_links.py
2026-03-22 16:18:35 +08:00

111 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import urllib.request
from bs4 import BeautifulSoup
import urllib.parse
import re
import os
# Index page of 御定佩文韻府 (四庫全書本) on zh.wikisource.org, fetched through
# the AllOrigins CORS proxy (raw passthrough) to avoid direct-access blocking.
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
def chinese_to_arabic(cn_str):
    """Convert a Chinese numeral string (e.g. '二十五', '一百零三') to an int.

    Handles units up to 千 (1000). Unknown characters contribute 0, so a
    stray non-numeral yields a best-effort value rather than an error.

    NOTE(review): the numeral literals in this function were stripped to
    empty strings by a bad text extraction (the dict had duplicate "" keys,
    making the function inoperative); they are reconstructed here from the
    values 0-9/10/100/1000 and the algorithm's structure — confirm against
    the original file.

    Args:
        cn_str: Chinese numeral string such as '十', '二十', '一百零五'.

    Returns:
        The Arabic integer value of *cn_str*.
    """
    cn_num = {
        "零": 0, "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
        "百": 100, "千": 1000,
    }
    result = 0
    temp = 0  # pending digit, multiplied by the next unit character
    for char in cn_str:
        if char in ("百", "千"):
            # A bare unit ('百' with no preceding digit) means one of it.
            if temp == 0:
                temp = 1
            result += temp * cn_num[char]
            temp = 0
        elif char == "十":
            if temp == 0:
                temp = 1
            if len(cn_str) == 1:
                return 10  # the whole string is just '十'
            elif result == 0 and temp == 1 and cn_str[0] == "十":
                # Leading '十' as in '十五' → 10 plus the trailing units.
                result += 10
                temp = 0
            else:
                result += temp * cn_num[char]
                temp = 0
        else:
            temp = cn_num.get(char, 0)
    result += temp  # trailing units digit, e.g. the 五 in '二十五'
    return result
def get_filename(vol_str):
    """Map a volume title like '卷一之二' or '卷三' to its local HTML filename.

    '卷X之Y' becomes '{X:03d}{Y}.html', a plain '卷X' becomes '{X:03d}.html',
    and anything else falls back to the raw title plus '.html'.
    """
    two_part = re.match(r"卷(.+?)之(.+)", vol_str)
    if two_part:
        major = chinese_to_arabic(two_part.group(1))
        minor = chinese_to_arabic(two_part.group(2))
        return f"{major:03d}{minor}.html"

    single = re.match(r"卷(.+)", vol_str)
    if single:
        return f"{chinese_to_arabic(single.group(1)):03d}.html"

    # No 卷 pattern matched: use the title verbatim.
    return vol_str + ".html"
def main(
    html_dir="/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/",
    output_path="missing_urls.txt",
):
    """Record volume URLs whose HTML has not been downloaded yet.

    Fetches the Wikisource table of contents through the AllOrigins proxy,
    extracts every 卷X之Y volume link for 御定佩文韻府 (四庫全書本) — skipping
    全覽 ("complete view") pages — and writes the URLs whose expected local
    file is absent from *html_dir* to *output_path*, one per line.

    Args:
        html_dir: Directory holding already-downloaded volume HTML files.
        output_path: Text file that receives the missing URLs.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Best-effort script: on network failure, report and stop cleanly.
        # (The original left `html` undefined and relied on catching the
        # resulting NameError further down; an early return is equivalent.)
        print("Failed to fetch:", e)
        return

    soup = BeautifulSoup(html, "html.parser")
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()
    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"

    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)
        # Only 卷X之Y links for this work; 全覽 whole-book pages are skipped.
        if not (
            "御定佩文韻府" in unquoted_href
            and "四庫全書本" in unquoted_href
            and "/卷" in unquoted_href
        ):
            continue
        title_part = unquoted_href.split("/")[-1]
        if "全覽" in title_part or "全览" in title_part:
            continue
        if not re.match(r"卷.+?之.+", title_part):
            continue
        # Resolve relative hrefs and drop any fragment before deduplicating.
        full_url = urllib.parse.urljoin(base_url, href).split("#")[0]
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)
        if get_filename(title_part) not in existing_files:
            missing_urls.append(full_url)

    with open(output_path, "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")
    print(f"Found {len(missing_urls)} missing URLs.")
# Script entry point: run the link-extraction pass when executed directly.
if __name__ == "__main__":
    main()