Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
110
佩文韵府/extract_links.py
Normal file
110
佩文韵府/extract_links.py
Normal file
@@ -0,0 +1,110 @@
|
||||
import urllib.request
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib.parse
|
||||
import re
|
||||
import os
|
||||
|
||||
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
|
||||
|
||||
def chinese_to_arabic(cn_str):
    """Convert a Chinese numeral string (up to the thousands) to an int.

    Supports the multipliers 十/百/千 with an implicit leading 一 (so a
    bare "十" is 10, "十五" is 15) and the zero placeholders 零/〇.
    Characters outside the numeral table contribute 0.

    Args:
        cn_str: Chinese numeral string, e.g. "二十三" or "一千零六".

    Returns:
        The integer value, e.g. 23 or 1006.
    """
    cn_digit = {
        "零": 0, "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
    }
    multiplier = {"十": 10, "百": 100, "千": 1000}

    result = 0
    temp = 0
    for char in cn_str:
        if char in multiplier:
            # A multiplier with no preceding digit means 1x
            # ("十" == 10, "百" == 100, ...).
            result += (temp or 1) * multiplier[char]
            temp = 0
        else:
            temp = cn_digit.get(char, 0)
    # A trailing units digit (e.g. the 三 in "二十三") is still in temp.
    return result + temp
|
||||
|
||||
def get_filename(vol_str):
    """Map a volume title to a zero-padded .html filename.

    "卷X之Y" -> "卷XXX之Y.html" (volume padded to 3 digits),
    "卷X"    -> "卷XXX.html",
    anything else -> the title itself with ".html" appended.
    """
    two_part = re.match(r"卷(.+?)之(.+)", vol_str)
    if two_part is not None:
        volume = chinese_to_arabic(two_part.group(1))
        part = chinese_to_arabic(two_part.group(2))
        return f"卷{volume:03d}之{part}.html"

    one_part = re.match(r"卷(.+)", vol_str)
    if one_part is not None:
        volume = chinese_to_arabic(one_part.group(1))
        return f"卷{volume:03d}.html"

    return f"{vol_str}.html"
|
||||
|
||||
def main():
    """Scrape the 佩文韻府 Wikisource index page and collect volume pages
    whose HTML has not been downloaded into the local mirror yet.

    Writes one missing URL per line to missing_urls.txt and prints a
    summary count. On fetch failure it reports the error and exits
    without writing anything.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Network failure is non-fatal for the pipeline: report and return
        # early instead of falling through to an undefined `html`.
        print("Failed to fetch:", e)
        return

    soup = BeautifulSoup(html, "html.parser")

    # NOTE(review): hard-coded absolute path — consider making this
    # configurable (env var or CLI argument).
    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()

    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"

    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)

        # Only include volume links of this edition (ignore 全览 files).
        if "御定佩文韻府" in unquoted_href and "四庫全書本" in unquoted_href and "/卷" in unquoted_href:
            title_part = unquoted_href.split("/")[-1]

            if "全覽" in title_part or "全览" in title_part:
                continue

            # Keep only `卷X之Y` style titles.
            if not re.match(r"卷.+?之.+", title_part):
                continue

            full_url = urllib.parse.urljoin(base_url, href)
            full_url = full_url.split('#')[0]  # drop fragment anchors

            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)

            filename = get_filename(title_part)
            if filename not in existing_files:
                missing_urls.append(full_url)

    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")

    print(f"Found {len(missing_urls)} missing URLs.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user