Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
32
五车韵瑞/parse_shidian.py
Normal file
32
五车韵瑞/parse_shidian.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import re
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Chapter page to probe for extractable text.
url = "https://www.shidianguji.com/book/CADAL02059421/chapter/1lmkv0n02yhom?version=2"
# Browser-like User-Agent instead of the requests default
# (presumably the site treats unknown clients differently -- TODO confirm).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}
# requests applies no timeout unless one is given, so an unresponsive
# server would block this script indefinitely.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx rather than parsing an error page and reporting
# misleading "found N paragraphs" results below.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
|
||||
|
||||
# Scan every <script> tag for an embedded state-hydration payload,
# identified by "__INIT_DATA__" or "window.__INITIAL_STATE__", and print
# its length plus a 500-character preview.
STATE_MARKERS = ('__INIT_DATA__', 'window.__INITIAL_STATE__')
scripts = soup.find_all('script')
for script_tag in scripts:
    payload = script_tag.string
    # .string is None/empty for tags without a single text child; skip those.
    if not payload:
        continue
    if any(marker in payload for marker in STATE_MARKERS):
        print(f"Found init state data of length: {len(payload)}")
        print(payload[:500])
|
||||
|
||||
# Inspect ordinary <p> elements: report how many were found and print the
# text of the first five (slicing an empty result simply prints nothing).
content = soup.find_all('p')
paragraph_count = len(content)
print(f"Found {paragraph_count} paragraphs.")
for paragraph in content[:5]:
    print(paragraph.get_text())
|
||||
|
||||
print("\n--- Let's look at another part ---")
# Flatten the whole document to plain text, then show a fixed-size window
# starting at the first occurrence of a known title string.
KNOWN_TITLE = "五车韵瑞"
PREVIEW_CHARS = 500
text = soup.get_text()
idx = text.find(KNOWN_TITLE)
# str.find returns -1 when the title is absent; print only on a hit.
if idx >= 0:
    print(text[idx:idx + PREVIEW_CHARS])
|
||||
Reference in New Issue
Block a user