initial commit
This commit is contained in:
16
tools/patch.py
Executable file
16
tools/patch.py
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python2.7
|
||||
t = open("mobi/__init__.py",'r').read().replace(
|
||||
"uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])",
|
||||
"result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+2]['record Data Offset']-self.config['mobi']['extra bytes']])"
|
||||
).replace(
|
||||
"for record in range(1, self.config['mobi']['First Non-book index'] - 1):",
|
||||
"for record in range(0, 10000000,1):"
|
||||
)
|
||||
open("mobi/__init__.py",'w').write(t)
|
||||
t = open("mobi/lz77.py",'r').read().replace(
|
||||
"print(\"WARNING:","#print(\"WARNING:"
|
||||
).replace(
|
||||
"\" beginning of text!",
|
||||
"#"
|
||||
)
|
||||
open("mobi/lz77.py",'w').write(t)
|
||||
114
tools/to_json.py
Executable file
114
tools/to_json.py
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python2.7
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import re
|
||||
import io
|
||||
import sys
|
||||
import json
|
||||
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf8')
|
||||
|
||||
txt = open("../zdic.txt",'r').read()
|
||||
ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
|
||||
ent = [(x[0].strip(),x[1]) for x in ent]
|
||||
ent = [x for x in ent if x[0] != ""]
|
||||
|
||||
def first(l):
|
||||
return "" if len(l) == 0 else l[0]
|
||||
|
||||
def rem_bad_char(t):
|
||||
return t.decode('utf-8','ignore')
|
||||
|
||||
def heads_to_name(t):
|
||||
u = [x for x in t if len(hex(ord(x)))-2 <= 4]
|
||||
u = ("".join(u)).decode("utf-8","ignore")
|
||||
#print u
|
||||
return u
|
||||
|
||||
def append_if_ok(heads,word):
|
||||
try:
|
||||
fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
|
||||
open(fn,'wb').write("test.")
|
||||
os.remove(fn)
|
||||
heads.append(unicode(word)[0].lower())
|
||||
return True
|
||||
except:
|
||||
print("BAD WORD NAME:",word)
|
||||
return False
|
||||
|
||||
heads = []
|
||||
result = {}
|
||||
weirdo = {}
|
||||
for e in ent:
|
||||
|
||||
word = e[0]
|
||||
content = e[1]
|
||||
print word,
|
||||
|
||||
is_ok = True
|
||||
|
||||
if len(heads) == 0:
|
||||
heads = []
|
||||
is_ok = append_if_ok(heads,word)
|
||||
|
||||
else:
|
||||
try:
|
||||
b = not (word.lower()).startswith(heads[-1].lower())
|
||||
except:
|
||||
continue
|
||||
if b:
|
||||
if len(result) > 1000 or len(heads) >= 128:
|
||||
fn = heads_to_name(heads)
|
||||
open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))
|
||||
|
||||
result = {}
|
||||
heads = []
|
||||
is_ok = append_if_ok(heads,word)
|
||||
else:
|
||||
is_ok = append_if_ok(heads,word)
|
||||
|
||||
|
||||
traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)</li>",content))
|
||||
pinyin = first(re.findall(r"拼音.*?:(.*?)[<\t ]",content))
|
||||
zhuyin = first(re.findall(r"注音.*?:(.*?)<",content))
|
||||
definition = re.findall(r"<li.*?>(.*?)</li>",content)
|
||||
if len(definition) == 0:
|
||||
definition = [content]
|
||||
|
||||
definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
|
||||
definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
|
||||
definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
|
||||
#definition = [re.sub(r"【解释】:","",d).strip() for d in definition]
|
||||
traditional = re.sub(r"<.*?>","",traditional).replace(" ","")
|
||||
|
||||
definition = [rem_bad_char(d) for d in definition
|
||||
if (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
|
||||
and (len(zhuyin) == 0 or zhuyin not in d ) \
|
||||
and "繁体" not in d
|
||||
and "简体" not in d
|
||||
and "郑码" not in d
|
||||
and "拼音" not in d
|
||||
and "粤语:" not in d
|
||||
and "潮州话:" not in d
|
||||
and "UniCode" not in d
|
||||
and "◎" not in d
|
||||
and len(d.replace(word,"")) > 0
|
||||
]
|
||||
|
||||
if len(definition) == 0:
|
||||
definition = [rem_bad_char(content)]
|
||||
definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
|
||||
|
||||
|
||||
thing = {"TRD":traditional.decode('utf-8',"ignore"),
|
||||
"PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
|
||||
"DEF":definition,
|
||||
}
|
||||
if is_ok:
|
||||
result[word]=thing
|
||||
else:
|
||||
weirdo[word]=thing
|
||||
|
||||
open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))
|
||||
|
||||
8
tools/to_txt.py
Executable file
8
tools/to_txt.py
Executable file
@@ -0,0 +1,8 @@
|
||||
#!/usr/bin/env python2.7
|
||||
from mobi import Mobi
|
||||
|
||||
book = Mobi("../zdic.prc");
|
||||
book.parse();
|
||||
|
||||
for record in book:
|
||||
print record,
|
||||
Reference in New Issue
Block a user