initial commit

This commit is contained in:
lingdong huang
2020-08-06 13:32:21 -04:00
commit 86eb38d053
13 changed files with 524 additions and 0 deletions

114
tools/to_json.py Executable file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
import os
import re
import io
import sys
import json
reload(sys)
sys.setdefaultencoding('utf8')
txt = open("../zdic.txt",'r').read()
ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
ent = [(x[0].strip(),x[1]) for x in ent]
ent = [x for x in ent if x[0] != ""]
def first(l):
return "" if len(l) == 0 else l[0]
def rem_bad_char(t):
return t.decode('utf-8','ignore')
def heads_to_name(t):
u = [x for x in t if len(hex(ord(x)))-2 <= 4]
u = ("".join(u)).decode("utf-8","ignore")
#print u
return u
def append_if_ok(heads,word):
try:
fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
open(fn,'wb').write("test.")
os.remove(fn)
heads.append(unicode(word)[0].lower())
return True
except:
print("BAD WORD NAME:",word)
return False
heads = []
result = {}
weirdo = {}
for e in ent:
word = e[0]
content = e[1]
print word,
is_ok = True
if len(heads) == 0:
heads = []
is_ok = append_if_ok(heads,word)
else:
try:
b = not (word.lower()).startswith(heads[-1].lower())
except:
continue
if b:
if len(result) > 1000 or len(heads) >= 128:
fn = heads_to_name(heads)
open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))
result = {}
heads = []
is_ok = append_if_ok(heads,word)
else:
is_ok = append_if_ok(heads,word)
traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)</li>",content))
pinyin = first(re.findall(r"拼音.*?(.*?)[<\t ]",content))
zhuyin = first(re.findall(r"注音.*?(.*?)<",content))
definition = re.findall(r"<li.*?>(.*?)</li>",content)
if len(definition) == 0:
definition = [content]
definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
#definition = [re.sub(r"【解释】:","",d).strip() for d in definition]
traditional = re.sub(r"<.*?>","",traditional).replace(" ","")
definition = [rem_bad_char(d) for d in definition
if (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
and (len(zhuyin) == 0 or zhuyin not in d ) \
and "繁体" not in d
and "简体" not in d
and "郑码" not in d
and "拼音" not in d
and "粤语:" not in d
and "潮州话:" not in d
and "UniCode" not in d
and "" not in d
and len(d.replace(word,"")) > 0
]
if len(definition) == 0:
definition = [rem_bad_char(content)]
definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
thing = {"TRD":traditional.decode('utf-8',"ignore"),
"PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
"DEF":definition,
}
if is_ok:
result[word]=thing
else:
weirdo[word]=thing
open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))