Files
zdic-cli/tools/to_json.py
lingdong huang 86eb38d053 initial commit
2020-08-06 13:32:21 -04:00

115 lines
3.3 KiB
Python
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import os
import re
import io
import sys
import json
# Python 2 only: site.py removes sys.setdefaultencoding at start-up, so the
# module is reloaded to get it back. Making UTF-8 the default codec lets the
# implicit str <-> unicode conversions further down work on non-ASCII text.
reload(sys)
sys.setdefaultencoding('utf8')

# Read the whole dump at once; the context manager closes the handle instead
# of leaving it to the garbage collector.
with open("../zdic.txt", 'r') as src:
    txt = src.read()

# An entry is an <h2> headword followed by its body, terminated by
# <mbp:pagebreak/>. re.DOTALL is not set, so "." never crosses a newline and
# only entries whose headword and body share one line are matched.
ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
ent = [(head.strip(), body) for head, body in ent]
# Entries whose headword is blank after stripping are discarded.
ent = [pair for pair in ent if pair[0] != ""]
def first(l):
    """Return the first element of l, or "" when l is empty."""
    if not l:
        return ""
    return l[0]
def rem_bad_char(t):
    """Return t as unicode text, dropping bytes that are not valid UTF-8.

    Byte strings are decoded as UTF-8 with undecodable bytes ignored.
    Text that is already unicode is returned unchanged, so the function
    does not depend on the interpreter's default encoding to re-encode it
    first.
    """
    if isinstance(t, type(u"")):
        return t
    return t.decode('utf-8', 'ignore')
def heads_to_name(t):
    """Join the single characters in t into a unicode file-name stem.

    Characters whose code point is above U+FFFF are left out of the name.
    If the joined value is still a byte string it is decoded as UTF-8 with
    undecodable bytes ignored; otherwise it is returned as is.

    Raises TypeError (from ord) if an element of t is not exactly one
    character long.
    """
    kept = [ch for ch in t if ord(ch) <= 0xFFFF]
    name = "".join(kept)
    if isinstance(name, type(u"")):
        return name
    return name.decode("utf-8", "ignore")
def append_if_ok(heads, word):
    """Append word's lower-cased first character to heads if it is usable.

    "Usable" is checked by probing the file system: a throw-away file named
    after heads plus the new character is created under ../zdic_json/ and
    removed again. heads is modified in place only when the probe succeeds.

    Returns True when the character was appended, False when anything in
    the probe failed (word not decodable, name not accepted by the file
    system, ...); in that case a message is printed and heads is untouched.
    """
    try:
        head = unicode(word)[0].lower()
        fn = "../zdic_json/" + heads_to_name(heads + [head]) + ".json"
        # Close the probe before deleting it rather than relying on the
        # garbage collector to release the handle.
        with open(fn, 'wb') as probe:
            probe.write("test.")
        os.remove(fn)
    except Exception:
        # Deliberately broad: any failure just means "do not use this head".
        # Unlike a bare except, Ctrl-C and SystemExit still propagate.
        print("BAD WORD NAME:", word)
        return False
    heads.append(head)
    return True
# An <li> candidate containing any of these substrings is dropped from DEF.
# NOTE(review): the last literal displays as an empty string, which is
# contained in every string and therefore rejects every candidate. The page
# this file was viewed on warns that it contains invisible Unicode
# characters, so the real literal may not be empty - confirm against the raw
# file before relying on it.
_SKIP_MARKERS = ("繁体", "简体", "郑码", "拼音", "粤语:", "潮州话:", "UniCode", "")


def _write_json(path, obj):
    """Dump obj as JSON into path, closing the file when done."""
    with open(path, 'wb') as out:
        out.write(json.dumps(obj))


def _parse_entry(word, content):
    """Build the record stored for one dictionary entry.

    word is the headword, content the raw markup captured after it.
    Returns a dict with:
      "TRD" - text captured after '#444">(' plus text captured after
              "繁体字:", with tags and spaces removed;
      "PRN" - [pinyin, zhuyin] as captured by the two regexes below;
      "DEF" - list of definition strings.
    All values are decoded from UTF-8 with undecodable bytes ignored.
    """
    traditional = (first(re.findall(r"#444\">\((.*?)\)", content))
                   + first(re.findall(r"繁体字:(.*?)</li>", content)))
    traditional = re.sub(r"<.*?>", "", traditional).replace(" ", "")
    pinyin = first(re.findall(r"拼音.*?(.*?)[<\t ]", content))
    zhuyin = first(re.findall(r"注音.*?(.*?)<", content))

    # Every <li> is a candidate; without any <li> the whole body is one.
    candidates = re.findall(r"<li.*?>(.*?)</li>", content)
    if not candidates:
        candidates = [content]

    cleaned = []
    for d in candidates:
        d = re.sub(r"<.*?>", "", d).strip()               # drop markup
        d = re.sub(r"^.{0,1}\d.*?[\.\)]", "", d).strip()  # drop leading numbering
        d = re.sub(r"===汉英互译===", "", d).strip()
        cleaned.append(d)

    def is_definition(d):
        # Lines that merely repeat the reading are not definitions.
        if pinyin and (pinyin in d or pinyin.replace(" ", "") in d):
            return False
        if zhuyin and zhuyin in d:
            return False
        if any(marker in d for marker in _SKIP_MARKERS):
            return False
        # Reject lines that are empty once the headword itself is removed.
        return len(d.replace(word, "")) > 0

    definition = [rem_bad_char(d) for d in cleaned if is_definition(d)]
    if not definition:
        # Nothing survived the filter: fall back to the tag-stripped body.
        # NOTE(review): nesting reconstructed - the tag-stripping line is
        # assumed to belong to this fallback branch only.
        definition = [re.sub(r"<.*?>", "", rem_bad_char(content)).strip()]

    return {"TRD": traditional.decode('utf-8', "ignore"),
            "PRN": [pinyin.decode('utf-8', "ignore"),
                    zhuyin.decode('utf-8', "ignore")],
            "DEF": definition,
            }


heads = []    # first characters covered by the batch currently being built
result = {}   # headword -> record for the current batch
weirdo = {}   # headword -> record for entries whose head was rejected

for word, content in ent:
    # Progress output: headwords separated by spaces on one line.
    sys.stdout.write(word + " ")
    is_ok = True
    if not heads:
        is_ok = append_if_ok(heads, word)
    else:
        try:
            # word is a byte string and heads holds unicode characters, so
            # this comparison decodes word implicitly and can fail.
            starts_new_head = not word.lower().startswith(heads[-1].lower())
        except UnicodeError:
            # Undecodable headword: the entry is skipped entirely.
            continue
        if starts_new_head:
            # Batches are only closed on a head change, once they are large
            # enough or cover too many heads for one file name.
            if len(result) > 1000 or len(heads) >= 128:
                _write_json("../zdic_json/" + heads_to_name(heads) + ".json",
                            result)
                result = {}
                heads = []
            is_ok = append_if_ok(heads, word)

    thing = _parse_entry(word, content)
    if is_ok:
        result[word] = thing
    else:
        weirdo[word] = thing

# Write the batch still being built when the input ran out; previously these
# entries were parsed but never saved.
if result:
    _write_json("../zdic_json/" + heads_to_name(heads) + ".json", result)

_write_json("../zdic_json/extended.json", weirdo)