# spider-ctext/佩文韵府/test_strip.py

import json
import re

# Markers that may be prepended to a headword key and are removed before
# the headword's length is measured.
prefixes = ["韻藻", "增"]


def strip_prefixes(word, markers=None):
    """Return *word* with each leading marker removed.

    Markers are tried once each, in list order, so a word that starts with
    several markers in that order loses all of them. A marker is removed
    only when something would remain afterwards; a word that consists of
    nothing but the marker is returned unchanged.

    Args:
        word: The headword key as it appears in the data.
        markers: Prefixes to strip; defaults to the module-level
            ``prefixes`` list.

    Returns:
        The headword without its leading markers.
    """
    if markers is None:
        markers = prefixes
    clean_word = word
    for p in markers:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
    return clean_word


def find_mismatches(data):
    """Find entries whose "丨" count is not a multiple of the headword length.

    NOTE(review): presumably each "丨" in an entry's text stands in for one
    character of the headword, which is why a whole multiple is expected.
    Confirm against the data source.

    Args:
        data: Mapping of rhyme name to a mapping that may hold a "词条"
            dict of ``{headword: text}``. The top-level keys "metadata"
            and "preface" are skipped.

    Returns:
        A ``(total_pipes, mismatches)`` tuple. ``total_pipes`` is the
        number of entries containing at least one "丨" (entries, not
        individual marks). ``mismatches`` is a list of
        ``(word, clean_word, pipe_count, content)`` tuples in iteration
        order.
    """
    mismatches = []
    total_pipes = 0

    for rhyme, r_data in data.items():
        if rhyme in ("metadata", "preface"):
            continue
        for word, content in r_data.get("词条", {}).items():
            pipe_count = content.count("丨")
            if pipe_count == 0:
                continue
            total_pipes += 1

            clean_word = strip_prefixes(word)
            # An empty headword cannot account for any marks; report it
            # rather than dividing by zero in the modulo below.
            if not clean_word or pipe_count % len(clean_word) != 0:
                mismatches.append((word, clean_word, pipe_count, content))

    return total_pipes, mismatches


def main(path='peiwenyunfu.json'):
    """Load the JSON dump at *path* and print a mismatch summary.

    Prints the two totals followed by the first ten mismatching entries.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    total_pipes, mismatches = find_mismatches(data)

    print(f"Total entries with pipes: {total_pipes}")
    print(f"Total mismatches after stripping: {len(mismatches)}")
    for w, cw, p, c in mismatches[:10]:
        print(f"{w} -> {cw} (len {len(cw)}), pipes: {p}")
        print(f"  {c}")


# Guarded so that importing this module (e.g. pytest collecting a file
# named test_*.py) does not try to read the data file.
if __name__ == "__main__":
    main()