Update: 初学记、佩文韵府 and 五车韵瑞

This commit is contained in:
denglifan
2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions

View File

@@ -0,0 +1,110 @@
import urllib.request
from bs4 import BeautifulSoup
import urllib.parse
import re
import os
# Wikisource index page for 御定佩文韻府 (四庫全書本), fetched through the
# allorigins proxy to avoid direct-access restrictions.
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
def chinese_to_arabic(cn_str):
    """Convert a Chinese-numeral string (e.g. 二十一) to an int.

    Handles digits 零/〇 through 九 and the units 十 (10), 百 (100),
    千 (1000). Unrecognized characters contribute 0.

    NOTE(review): the numeral keys in the original dict (and the literals
    compared against below) were garbled to empty strings by a bad
    extraction; they have been reconstructed as the standard Chinese
    numerals, which is the only set consistent with this algorithm.
    """
    cn_num = {
        "零": 0, "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
        "百": 100, "千": 1000,
    }
    result = 0
    temp = 0  # pending digit awaiting a unit multiplier
    for char in cn_str:
        if char in ("百", "千"):
            # A bare unit (e.g. 百三) implies a multiplier of one.
            if temp == 0:
                temp = 1
            result += temp * cn_num[char]
            temp = 0
        elif char == "十":
            if temp == 0:
                temp = 1
            if len(cn_str) == 1:
                # The string is exactly 十.
                return 10
            elif result == 0 and temp == 1 and cn_str[0] == "十":
                # Leading 十 as in 十五 (= 15).
                result += 10
                temp = 0
            else:
                result += temp * cn_num[char]
                temp = 0
        else:
            # Plain digit: hold it until the next unit character.
            temp = cn_num.get(char, 0)
    # Trailing units digit (e.g. the 一 in 二十一).
    result += temp
    return result
def get_filename(vol_str):
    """Map a volume title such as 卷一之二 to its local HTML filename.

    卷X之Y titles become "{X:03d}{Y}.html", plain 卷X titles become
    "{X:03d}.html", and anything else falls back to the raw string
    with ".html" appended.
    """
    two_part = re.match(r"卷(.+?)之(.+)", vol_str)
    if two_part:
        major = chinese_to_arabic(two_part.group(1))
        minor = chinese_to_arabic(two_part.group(2))
        return f"{major:03d}{minor}.html"
    one_part = re.match(r"卷(.+)", vol_str)
    if one_part:
        return f"{chinese_to_arabic(one_part.group(1)):03d}.html"
    return vol_str + ".html"
def main():
    """Find volume pages of 御定佩文韻府 that have not been downloaded yet.

    Fetches the Wikisource index page, collects links whose title matches
    the 卷X之Y pattern (skipping 全览/全覽 compilation pages), maps each
    title to its local HTML filename, and writes the URLs whose files are
    missing from the local directory to missing_urls.txt.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Fetch failed (network down, HTTP error, timeout): report and bail
        # out explicitly instead of leaving `html` unbound and relying on a
        # later NameError to abort, as the original code did.
        print("Failed to fetch:", e)
        return

    soup = BeautifulSoup(html, "html.parser")
    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()
    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"
    # Compile once; the original re-matched the pattern on every link.
    vol_pattern = re.compile(r"卷.+?之.+")
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)
        # Only volume pages of the 四庫全書 edition.
        if ("御定佩文韻府" not in unquoted_href
                or "四庫全書本" not in unquoted_href
                or "/卷" not in unquoted_href):
            continue
        title_part = unquoted_href.split("/")[-1]
        # Skip 全览 (whole-volume overview) pages.
        if "全覽" in title_part or "全览" in title_part:
            continue
        # Require the strict 卷X之Y form.
        if not vol_pattern.match(title_part):
            continue
        # Normalize: absolute URL, fragment stripped, duplicates dropped.
        full_url = urllib.parse.urljoin(base_url, href).split("#")[0]
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)
        if get_filename(title_part) not in existing_files:
            missing_urls.append(full_url)
    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")
    print(f"Found {len(missing_urls)} missing URLs.")
# Script entry point: run the scrape only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()