# spider-ctext/初学记/test_parse.py

import bs4
import os
import json
import re

# Directory containing the XHTML files that this script parses. The path is
# relative, so it is resolved against the current working directory.
# NOTE(review): presumably the OPS folder of an unpacked EPUB — confirm.
html_dir = "epub_extracted/OPS"


def parse_html(filepath):
    """Extract the text lines of every ``<div class="poem">`` in an HTML file.

    The file is read as UTF-8 and parsed with the ``html.parser`` backend.
    For each ``<p>`` found inside a poem div, the direct children are walked
    in document order: a ``<br>`` child closes the line collected so far, and
    any other child contributes its text to the current line.

    Args:
        filepath: Path of the (X)HTML file to read.

    Returns:
        A list of strings, one per line, each stripped of leading and
        trailing whitespace. A ``<br>`` with no text before it produces an
        empty string.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = bs4.BeautifulSoup(markup, "html.parser")

    lines = []
    for poem in document.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            pending = []
            for node in paragraph.children:
                if node.name != "br":
                    # Text node or inline element (<span>, <small>, ...):
                    # its text belongs to the line being assembled.
                    pending.append(node.get_text())
                    continue
                # A direct <br> child terminates the current line, even when
                # nothing has been collected for it yet.
                lines.append("".join(pending).strip())
                pending = []
            # Flush whatever followed the last <br> (or the whole paragraph
            # when it contained no <br> at all).
            if pending:
                lines.append("".join(pending).strip())
    return lines


# Manual smoke check: parse one chapter file and print the first 50 extracted
# lines, each prefixed with its index, so the output can be inspected by eye.
sample_path = os.path.join(
    html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml"
)
texts = parse_html(sample_path)
preview = texts[:50]
for i, t in enumerate(preview):
    print(f"{i}: {t}")