"""Find missing 御定佩文韻府 (四庫全書本) volume pages on zh.wikisource.org."""
import os
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
# Index page listing all volume links, fetched through the AllOrigins CORS proxy.
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
|
||
|
||
def chinese_to_arabic(cn_str):
    """Convert a Chinese numeral string to an int (supports up to 千/thousands).

    Examples: "十" -> 10, "十五" -> 15, "二十三" -> 23, "一百零五" -> 105.
    Unknown characters are treated as 0; an empty string yields 0.
    """
    cn_digit = {
        "零": 0, "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
    }
    cn_unit = {"十": 10, "百": 100, "千": 1000}

    result = 0
    temp = 0  # digit awaiting a unit multiplier
    for char in cn_str:
        if char in cn_unit:
            # A unit with no preceding digit means one of that unit,
            # e.g. "十五" is 15, "百" alone is 100.
            # (The original code special-cased a leading "十" and a lone "十";
            # both reduce to this same general rule.)
            result += (temp or 1) * cn_unit[char]
            temp = 0
        else:
            temp = cn_digit.get(char, 0)
    # Trailing digit with no unit is the ones place (e.g. the 5 in "十五").
    return result + temp
|
||
|
||
def get_filename(vol_str):
    """Map a volume title to its local html filename.

    "卷十二之三" -> "卷012之3.html"; "卷十二" -> "卷012.html";
    anything else is returned unchanged with ".html" appended.
    """
    two_part = re.match(r"卷(.+?)之(.+)", vol_str)
    if two_part is not None:
        major = chinese_to_arabic(two_part.group(1))
        minor = chinese_to_arabic(two_part.group(2))
        return f"卷{major:03d}之{minor}.html"

    single = re.match(r"卷(.+)", vol_str)
    if single is not None:
        return f"卷{chinese_to_arabic(single.group(1)):03d}.html"

    return vol_str + ".html"
|
||
|
||
def main():
    """Fetch the index page, collect volume links, and write the URLs of
    volumes whose html file is not yet downloaded to missing_urls.txt."""
    # Fetch the index page; on any failure report the error and stop.
    # (The original let `html` stay unbound and caught the resulting
    # NameError at the BeautifulSoup call — an explicit return is clearer.)
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        print("Failed to fetch:", e)
        return

    soup = BeautifulSoup(html, "html.parser")

    # Directory of already-downloaded volume pages; empty set if absent.
    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()

    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"

    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)

        # Keep only links to individual volumes of this edition.
        if not ("御定佩文韻府" in unquoted_href
                and "四庫全書本" in unquoted_href
                and "/卷" in unquoted_href):
            continue
        title_part = unquoted_href.split("/")[-1]

        # Skip the "full view" aggregate pages (both character variants).
        if "全覽" in title_part or "全览" in title_part:
            continue

        # Only `卷X之Y`-style titles map to our filename scheme.
        if not re.match(r"卷.+?之.+", title_part):
            continue

        # Normalize: absolute URL, fragment stripped, deduplicated.
        full_url = urllib.parse.urljoin(base_url, href).split('#')[0]
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)

        if get_filename(title_part) not in existing_files:
            missing_urls.append(full_url)

    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")

    print(f"Found {len(missing_urls)} missing URLs.")
|
||
|
||
# Script entry point — run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|