Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
24
初学记/test_extract.py
Normal file
24
初学记/test_extract.py
Normal file
@@ -0,0 +1,24 @@
|
||||
import bs4
|
||||
import re
|
||||
import sys
|
||||
|
||||
def parse_html(filepath):
    """Print and collect the section headings found in an (X)HTML file.

    The file is parsed with BeautifulSoup's ``html.parser``. Inside every
    ``<div class="poem">``, each ``<p>`` is scanned for direct ``<span>``
    children that carry a non-empty ``id`` attribute and whose stripped
    text ends with "部"; every such span is reported as a section heading.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list with the heading texts in the order they were encountered
        (empty when nothing matched).
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    sections = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            for child in p.children:
                # Only <span id="..."> children can be headings; <br> tags
                # and bare text nodes are skipped (text nodes have
                # name None, so .get() is never called on them).
                if child.name != "span" or not child.get("id"):
                    continue
                heading = child.text.strip()
                if heading.endswith("部"):
                    sections.append(heading)
                    print("Found Section:", heading)
    return sections
|
||||
|
||||
# Ad-hoc run against one fixed sample file; the relative path is resolved
# from the current working directory.
# NOTE(review): this call executes whenever the module is imported (there is
# no ``if __name__ == "__main__":`` guard) — confirm that is intended.
parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
|
||||
Reference in New Issue
Block a user