Update: 初学记、佩文韵府 and 五车韵瑞

2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions
--- a/五车韵瑞/parse_shidian.py
+++ b/五车韵瑞/parse_shidian.py
@@ -0,0 +1,45 @@
+"""Probe one shidianguji.com chapter page and print what its HTML contains.
+
+Fetches a single chapter URL and prints three things: any <script> payload
+that looks like embedded hydration state, the first few <p> elements, and
+the text following the first occurrence of the known string "五车韵瑞".
+"""
+
+import re
+import requests
+from bs4 import BeautifulSoup
+
+url = "https://www.shidianguji.com/book/CADAL02059421/chapter/1lmkv0n02yhom?version=2"
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
+}
+# A timeout keeps the probe from hanging forever on a stalled connection,
+# and raise_for_status() stops us from parsing an HTTP error page as content.
+response = requests.get(url, headers=headers, timeout=30)
+response.raise_for_status()
+# Parse the raw bytes so BeautifulSoup detects the charset from the document
+# itself; response.text can fall back to ISO-8859-1 when the Content-Type
+# header carries no charset, which would garble the Chinese text.
+soup = BeautifulSoup(response.content, 'html.parser')
+
+# Check script tags for "__INIT_DATA__" or similar state hydration
+scripts = soup.find_all('script')
+for s in scripts:
+    payload = s.string  # None when the tag has no single string child
+    if payload and ('__INIT_DATA__' in payload or 'window.__INITIAL_STATE__' in payload):
+        print(f"Found init state data of length: {len(payload)}")
+        print(payload[:500])
+
+# Check normal text elements
+content = soup.find_all('p')
+print(f"Found {len(content)} paragraphs.")
+for p in content[:5]:
+    print(p.text)
+
+print("\n--- Let's look at another part ---")
+# Try extracting text directly
+text = soup.get_text()
+# Find the title or some known text like "五车韵瑞"
+idx = text.find("五车韵瑞")
+if idx != -1:
+    print(text[idx:idx+500])