# spider-ctext/初学记/test_parse2.py

import bs4
import os
import json
import re

html_dir = "epub_extracted/OPS"


def parse_html(filepath):
    """Return the text lines found inside ``<div class="poem">`` blocks.

    The file at *filepath* is read as UTF-8 and parsed with the
    ``html.parser`` backend.  Within every ``<p>`` of every matching
    ``<div>``, the text of the direct children is concatenated and a
    ``<br>`` child ends the current line.  Each emitted line is stripped
    of leading/trailing whitespace; a ``<br>`` with nothing before it
    therefore yields an empty string.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = bs4.BeautifulSoup(markup, "html.parser")

    lines = []
    for container in document.find_all("div", class_="poem"):
        for paragraph in container.find_all("p"):
            pending = []
            for node in paragraph.children:
                if node.name != "br":
                    pending.append(node.get_text())
                    continue
                # A <br> closes the line accumulated so far.
                lines.append("".join(pending).strip())
                pending = []
            # Flush whatever follows the last <br> (or the whole <p> if
            # it contained no <br> at all).
            if pending:
                lines.append("".join(pending).strip())
    return lines


# Matches the closing bracket "〉" immediately followed by a genre keyword;
# the 诗文 section is taken to begin at that keyword.
_GENRE_PATTERN = re.compile(
    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"
)


def _split_at_genre(text):
    """Split *text* right after the first "〉" that precedes a genre keyword.

    Returns ``(head, tail)``.  ``head`` ends with the "〉" and ``tail``
    starts with the genre keyword.  When no such position exists the whole
    text is returned as ``head`` and ``tail`` is empty.
    """
    match = _GENRE_PATTERN.search(text)
    if match is None:
        return text, ""
    cut = match.start() + 1  # keep "〉" on the left, the genre on the right
    return text[:cut], text[cut:]


def extract_sections(entry_text):
    """Split one entry line into its 叙事 / 事对 / 诗文 sections.

    Args:
        entry_text: Full text of an entry, expected to contain the
            marker "〈叙事〉".

    Returns:
        A dict with the keys "叙事", "事对" and "诗文".  Sections that
        cannot be located are empty strings; if the "〈叙事〉" marker is
        missing, all three are empty.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # str.partition keeps everything after the marker, including text that
    # follows an embedded newline (a bare ".*" regex would stop at it).
    _, marker, body = entry_text.partition("〈叙事〉")
    if not marker:
        return result

    # Prefer the traditional spelling; fall back to the simplified one.
    pair_start = body.find("事對")
    if pair_start == -1:
        pair_start = body.find("事对")

    if pair_start == -1:
        # No 事对 section: the narrative runs up to the first genre marker.
        result["叙事"], result["诗文"] = _split_at_genre(body)
    else:
        result["叙事"] = body[:pair_start]
        # Skip the two-character "事對"/"事对" marker itself.
        after_pairs = body[pair_start + 2:]
        result["事对"], result["诗文"] = _split_at_genre(after_pairs)

    return result


# Parse the first volume and keep only the lines that carry a 〈叙事〉 marker.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)

entries = []
for line in texts:
    if "〈叙事〉" not in line:
        continue
    # Everything before the marker is the entry heading.
    entry_name = line.partition("〈叙事〉")[0]
    # Drop a trailing ordinal such as "第一" / "第二上" to get the bare word.
    word_name = re.sub(
        r"第[一二三四五六七八九十百]+(?:[上下])?$", "", entry_name
    ).strip()
    entries.append((word_name, entry_name, extract_sections(line)))

# Show a short summary of the first three entries as a sanity check.
for word, heading, parts in entries[:3]:
    print(f"Word: {word}")
    print(f"Entry: {heading}")
    print(f"叙事 len: {len(parts['叙事'])}")
    print(f"事对 len: {len(parts['事对'])}")
    print(f"诗文 len: {len(parts['诗文'])}")
    print("-" * 20)