initial commit

This commit is contained in:
lingdong huang
2020-08-06 13:32:21 -04:00
commit 86eb38d053
13 changed files with 524 additions and 0 deletions

7
.gitignore vendored Normal file
View File

@@ -0,0 +1,7 @@
.DS_Store
*/DS_Store
zdic.prc
zdic.txt
zdic_json/*
tools/mobi/*
zdic-cli-*

88
README.md Normal file
View File

@@ -0,0 +1,88 @@
# zdic-cli
An alternative, offline, regex-supporting, command-line interface to [zdic (漢典)](https://zdic.net), featuring:
- No internet connection required
- Full text search with regex: a command for finding characters/phrases in body text of definitions.
- Colorful text for highlighting entries
![](screenshots/screen000.png)
![](screenshots/screen001.png)
![](screenshots/screen002.png)
![](screenshots/screen003.png)
![](screenshots/screen004.png)
## Downloads
Standalone version available for windows, mac and linux. Please refer to the Releases page.
## Usage
zdic-cli runs as a REPL loop, and there are 5 types of commands you can enter:
```
def 甲 display entry for 甲
pre 甲 list words that starts with 甲
has 甲 乙 ... list words that contains 甲 and 乙 ...
txt 甲 list words whose full entry text contains 甲 (regex supported)
sel n display entry at previously returned list index n
```
`def` is probably the most common one for simply looking up characters/words. `pre` `has` `txt` will return an enumerated list listing all eligible entries, and `sel 0`/`sel 1`/`sel n` can be used to select from the list.
## Development setup
**This section is for compiling from source (and is somewhat complex), if you just would like to use the software, please check out the Releases page.**
### Dependencies
- node.js/npm
- python2. Tested on 2.7. It has to be python 2 instead of 3 because much of the data-processing work was done way back in a time when 2 was the norm. Sorry folks, but if you're a python3 purist feel free to send a PR!
- pkg (optional, to build standalone binaries for multiple platforms) `npm install -g pkg`
### Downloading and compiling the dictionary files
This repo does not contain the dictionary files themselves as they're too large. Instead, a shell script (`setup.sh`) is provided to automate the process:
- Automatically download the original Kindle dictionary format (.PRC) from the internet. The download link hardcoded in `setup.sh` might fail in the future, in which case a google search for `汉典.prc` should yield alternative resources.
- Automatically download a python library (kroo/mobi-python) for parsing mobi files. It is a rather old library with some oddities, so a find-and-replace script will be automatically run to patch some glitches in the source code :P
- `tools/to_txt.py` is run to extract a raw `txt` file from the kindle format `prc`.
- `tools/to_json.py` is run to generate a directory of `json` files from the `txt` to make lookup and formatting more efficient.
Run the shell script with:
```
sh setup.sh
```
At this point you'll have `zdic.prc`, `zdic.txt` and directory `zdic_json/`. Only `zdic_json` is needed, so if everything went well with the script, you can freely delete the other two and gain some 700MB of free space :)
### Compiling the binary
At this point you can also run the software by simply doing:
```
node index.js
```
You can also package it into a binary using
```
pkg .
```
provided you have the node and pkg dependencies installed.
Tip: to run the software by typing `zdic` anywhere, you can symlink it to `/usr/bin`, e.g.
```
ln -s path/to/zdic-cli/zdic-cli /usr/bin/zdic
```

259
index.js Normal file
View File

@@ -0,0 +1,259 @@
// zdic-cli entry point: offline REPL front-end over the JSON shards
// produced by tools/to_json.py.
const fs = require("fs")
const readline = require('readline');
// Directory holding the per-head-character dictionary shard files.
const dict_path = __dirname+'/zdic_json'
// Find the shard file whose name contains the first character of `word`;
// fall back to the catch-all "extended.json" shard when none matches.
function locate_file(word){
  const folder = dict_path;
  const head = word.slice(0,1);
  const files = fs.readdirSync(folder);
  for (const name of files){
    if (name.includes(head)){
      return name;
    }
  }
  return "extended.json"
}
// Load and parse one dictionary shard (path relative to dict_path).
function read_json(file_path){
  const raw = fs.readFileSync(dict_path + "/" + file_path);
  return JSON.parse(String(raw));
}
/**
 * Look up the exact entry for a headword.
 * @param {string} word - headword to look up
 * @returns {[string, Object]|null} [word, entry] pair, or null when absent
 */
function define(word){
  // NOTE: the original used `var dict_path = ...` here, shadowing the
  // module-level `dict_path` constant; renamed for clarity. The unused
  // `ret` accumulator was also removed.
  const file_name = locate_file(word);
  const dict = read_json(file_name);
  if (word in dict){
    return [word, dict[word]];
  }
  return null;
}
/**
 * List every headword in `word`'s shard that begins with `word`.
 * @param {string} word - prefix to match
 * @returns {string[]} matching headwords, in shard iteration order
 */
function starts_with(word){
  const ret = [];
  const dict = read_json(locate_file(word));
  // BUG FIX: the loop variable was an implicit global (`for (w in dict)`),
  // which would also throw in strict mode; declared properly here.
  for (const w in dict){
    if (w.startsWith(word)){
      ret.push(w);
    }
  }
  return ret;
}
// List every headword, across all shards, that contains each of the
// strings in `words` as a substring.
function contains(words){
  const ret = [];
  const files = fs.readdirSync(dict_path);
  for (const file of files){
    if (!file.endsWith(".json")){
      continue;
    }
    const dict = read_json(file);
    for (const w in dict){
      // `every` short-circuits exactly like the original break-out loop.
      if (words.every((needle) => w.includes(needle))){
        ret.push(w);
      }
    }
  }
  return ret;
}
// Regex-search the body text of every definition across all shards.
// For each first match in an entry, `callback` receives
// [listIndex, headword, contextSnippet] and the headword is collected
// into the returned list.
function full_text_search(word, view_len, callback){
  const ret = [];
  const pad = Math.floor((view_len - word.length) / 2);
  const re = new RegExp(word);
  const files = fs.readdirSync(dict_path);
  for (const file of files){
    if (!file.endsWith(".json")){
      continue;
    }
    const dict = read_json(file);
    for (const w in dict){
      const defs = dict[w]['DEF'];
      for (let j = 0; j < defs.length; j++){
        const idx = defs[j].search(re);
        if (idx == -1){
          continue;
        }
        // Snippet: roughly `pad` characters of context on each side.
        const lo = Math.max(idx - pad, 0);
        const snippet = defs[j].slice(lo, idx + word.length + pad);
        const item = [ret.length, w, snippet];
        ret.push(w);
        callback(item);
        break; // report only the first matching definition per entry
      }
    }
  }
  return ret;
}
// Render one dictionary entry as an ANSI-colored string:
// headword (bold green), optional traditional form, the [pinyin , zhuyin]
// readings in yellow, then each definition line with manual word-wrapping
// to the terminal width.
function render_def(word,entry){
  var result = ""
  result += "\x1b[32m\033[1m"+word+"\x1b[0m"
  // Traditional form, shown only when present.
  if (entry['TRD'] != ''){
    result += " \x1b[32m\033[1m("+entry['TRD']+")\x1b[0m "
  }
  // PRN = [pinyin, zhuyin]; the bracket pair opens/closes only when
  // pinyin is non-empty.
  if (entry['PRN'][0] != ''){
    result += " \x1b[33m[ "+entry['PRN'][0].trim()+""
  }
  if (entry['PRN'][1] != ''){
    result += " , "+entry['PRN'][1].trim()
  }
  if (entry['PRN'][0] != ''){
    result += " ]\x1b[0m"
  }
  result += "\n"
  // Column budget; each glyph is counted as 2 columns (j += 2 below),
  // which fits CJK-dominated text.
  var n = process.stdout.columns-4;
  for (var i = 0; i < entry['DEF'].length; i++){
    result += "\x1b[2m\x1b[0m"
    var j = 1;
    var t = entry['DEF'][i];
    // isf ("is first") tracks whether we are before the first 。;
    // everything after it is rendered dim.
    var isf = true;
    for (var c of t){
      if (c == "《"){
        result += "\x1b[31m" // book/source titles 《…》 in red
      }
      // NOTE(review): comparing `c` to the empty string can never be true
      // inside for...of — a special marker character appears to have been
      // lost from this copy of the source; confirm against the original.
      if (c == ""){
        result += "\x1b[33m"+c+"\x1b[0m";
        if (!isf){
          result += "\x1b[2m"
        }
      }else{
        result += c;
      }
      if (c == "》"){
        result += "\x1b[0m"
        if (!isf){
          result += "\x1b[2m"
        }
      }
      if (c == "。" && isf){
        result += "\x1b[2m"
        isf = false;
      }
      j+=2;
      // Manual wrap once the column budget for this line is spent.
      if (j >= n){
        j = 2;
        result += "\n  "
      }
    }
    result+="\x1b[0m\n"
  }
  result += ""
  return result
}
// Right-pad `x` with spaces to exactly `n` characters, truncating when
// `x` is already longer than `n`.
function char_pad(x, n){
  return x.padEnd(n, " ").slice(0, n);
}
// Format one numbered list cell: a yellow 4-wide right-aligned index,
// a space, then the word padded to `n` characters.
function render_item(i, x, n){
  const idx = i.toString().padStart(4);
  return "\x1b[33m" + idx + "\x1b[0m " + char_pad(x, n);
}
/**
 * Lay matched words out in terminal-width columns.
 * Non-CJK characters are stripped from each word for display, unless
 * stripping would empty the word entirely.
 * @param {string[]} lst - words to render
 * @returns {string} multi-line ANSI string (no trailing newline on a
 *   partial final row)
 */
function render_list(lst){
  const npi = 6; // characters reserved per word cell
  // Cells per row: each cell is npi double-width glyphs plus the index.
  const npl = Math.floor((process.stdout.columns - 2) / (npi * 2 + 5));
  let result = "";
  for (let i = 0; i < lst.length; i++){
    let li = lst[i].replace(/[^一-鿿]/g, "");
    if (!li.length){ // oops - stripping removed everything, keep raw word
      li = lst[i];
    }
    result += render_item(i, li, npi);
    // Newline after the last cell of each full row.
    // (Dead `else { result += "" }` branch removed.)
    if (i % npl == npl - 1){
      result += "\n";
    }
  }
  return result;
}
// REPL command table: name -> [usage string, help text].
// Entering a command with no argument prints its row from this table;
// an unknown 3-letter prefix throws on lookup and falls into main()'s
// direct-query fallback.
var commands = {
"def":["def x ","display entry for x"],
"pre":["pre x ","list words that starts with x"],
"has":["has x y ...","list words that contains x and y ..."],
"txt":["txt x ","list words whose full entry text contains x (regex supported)"],
"sel":["sel n ","display entry at previously returned list index n"],
}
// One REPL turn: prompt, parse "cmd arg", dispatch, then recurse with the
// latest result list so a later `sel n` can index into it.
// `prev` is the word list returned by the previous pre/has/txt command.
function main(prev){
  var curr = null;
  var def = null;
  var fail = false;
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });
  rl.question('> ', (answer) => {
    try{
      answer = answer.trim();
      // Commands are exactly 3 letters; the argument starts after "cmd ".
      var cmd = answer.slice(0,3);
      var arg = answer.slice(4);
      if (!arg.length){
        // No argument: print the usage line (throws on unknown cmd,
        // which the outer catch turns into a direct-query attempt).
        console.log(`\x1b[36m${commands[cmd][0]} \x1b[0m${commands[cmd][1]}`);
      }else{
        // NOTE: the `var def` declarations below all hoist to the single
        // `def` declared at the top of main().
        if (cmd == "def"){
          var def = define(arg);
          if (def){
            console.log(render_def(...def));
          }
        }else if (cmd == "pre"){
          curr = starts_with(arg);
          console.log(render_list(curr));
        }else if (cmd == "has"){
          curr = contains(arg.split(" "));
          console.log(render_list(curr));
        }else if (cmd == "txt"){
          // Matches are streamed through the callback as they are found.
          curr = full_text_search(arg,Math.floor((process.stdout.columns-20)/2),function(x){
            console.log(`${render_item(x[0],x[1],6)}\x1b[2m${x[2]}\x1b[0m`);
          });
        }else if (cmd == "sel"){
          // Index into the list produced by the previous command.
          var def = define(prev[parseInt(arg)]);
          if (def){
            console.log(render_def(...def))
          }
        }else{
          console.log("\x1b[31munsupported command.\x1b[0m")
          fail = true
        }
        if (!fail && (def == null && (curr == null || curr.length == 0))){
          console.log("\x1b[2m(0 result returned)\x1b[0m")
        }
      }
    }catch(e){
      // Anything unparseable is retried as a bare dictionary lookup.
      console.log("\x1b[31mcommand parse failed. trying as direct query...\x1b[0m")
      // console.log(e)
      try{
        var def = define(answer);
        if (def){
          console.log(render_def(...def));
        }else{
          throw new Error();
        }
      }catch(ee){
        console.log("\x1b[31mcommand parse totally failed.\x1b[0m")
      }
    }
    rl.close()
    // Carry the latest non-null list forward so `sel` still works after
    // a command that produced no list.
    main(curr||prev);
  });
}
// Startup banner; the box borders assume each CJK glyph renders 2 columns
// wide. NOTE(review): "Unoffical" is a typo, but correcting it would push
// the line past the box border, so the banner needs re-balancing to fix.
console.log("╔═════════════════════════════════════════════╗")
console.log("║\x1b[31m 漢 典 CLI \x1b[0m║")
console.log("║Unoffical offline 漢典 (zdic.net) commandline║")
console.log("║\x1b[2m w/ data derived from `汉典.prc` (for Kindle)\x1b[0m║")
console.log("║\x1b[2m Lingdong Huang 2020 \x1b[0m║");
console.log("╚═════════════════════════════════════════════╝")
// One-line help listing the available commands in cyan.
console.log(`commands: ${Object.keys(commands).map(x=>( "\x1b[36m"+x+"\x1b[0m" )).join(",") }, run without arguments to see help`);
// Start the REPL with no previous result list.
main();

7
package.json Normal file
View File

@@ -0,0 +1,7 @@
{
"name":"zdic-cli",
"pkg": {
"assets": "zdic_json/*"
},
"bin":"index.js"
}

BIN
screenshots/screen000.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 226 KiB

BIN
screenshots/screen001.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 370 KiB

BIN
screenshots/screen002.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 413 KiB

BIN
screenshots/screen003.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 407 KiB

BIN
screenshots/screen004.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 454 KiB

25
setup.sh Normal file
View File

@@ -0,0 +1,25 @@
# setup.sh - download the Kindle dictionary and convert it into the
# zdic_json/ directory consumed by index.js.  Run with: sh setup.sh
echo "downloading the dictionary..."
# NOTE(review): this hardcoded mirror may disappear; per the README,
# search for `汉典.prc` if the download fails.
curl https://blog.xjpvictor.info/wp-content/uploads/汉典.prc > zdic.prc
cd tools
echo "downloading dependencies..."
# kroo/mobi-python: an old python2 library for parsing .mobi/.prc files;
# only its `mobi` package is kept.
git clone https://github.com/kroo/mobi-python
cp -r mobi-python/mobi ./mobi
rm -rf mobi-python
echo "patching bugs in dependencies..."
# patch.py rewrites the vendored sources in-place (see tools/patch.py).
chmod +x patch.py
./patch.py
echo "converting dictionary to txt... (this might take a while ~10 mins)"
chmod +x to_txt.py
./to_txt.py > ../zdic.txt
echo "converting txt to json..."
mkdir ../zdic_json
chmod +x to_json.py
./to_json.py
echo "done setting up!"
echo "to use the app, either run 'pkg .' to package into a binary, or run 'node index.js' directly."

16
tools/patch.py Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python2.7
# Monkey-patch the vendored kroo/mobi-python sources in-place so they can
# fully extract the zdic .prc file.  Run once by setup.sh after cloning.
#
# Patch 1: capture the decompressed record into `result` and read through
# recordnum+2 instead of recordnum+1 (avoids truncating record data).
# Patch 2: iterate over a large fixed range instead of trusting the
# 'First Non-book index' header field.
t = open("mobi/__init__.py",'r').read().replace(
"uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])",
"result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+2]['record Data Offset']-self.config['mobi']['extra bytes']])"
).replace(
"for record in range(1, self.config['mobi']['First Non-book index'] - 1):",
"for record in range(0, 10000000,1):"
)
open("mobi/__init__.py",'w').write(t)
# Silence a very chatty warning in the LZ77 decoder by commenting it out.
t = open("mobi/lz77.py",'r').read().replace(
"print(\"WARNING:","#print(\"WARNING:"
).replace(
"\" beginning of text!",
"#"
)
open("mobi/lz77.py",'w').write(t)

114
tools/to_json.py Executable file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# Split the raw zdic.txt dump into per-shard JSON files under ../zdic_json.
import os
import re
import io
import sys
import json
# Python 2 only: force utf8 as the default str<->unicode codec.
reload(sys)
sys.setdefaultencoding('utf8')
# Each entry in the dump is "<h2>headword</h2>body<mbp:pagebreak/>".
txt = open("../zdic.txt",'r').read()
ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
ent = [(x[0].strip(),x[1]) for x in ent]
# Drop entries with an empty headword.
ent = [x for x in ent if x[0] != ""]
def first(l):
    """Return the first element of l, or the empty string when l is empty."""
    if l:
        return l[0]
    return ""
def rem_bad_char(t):
    # Python 2: decode a byte string to unicode, silently dropping any
    # bytes that are not valid UTF-8.
    return t.decode('utf-8','ignore')
def heads_to_name(t):
    # Build a shard filename from the collected head characters, keeping
    # only BMP characters (code point <= 0xFFFF, i.e. hex digits <= 4)
    # so the name stays well-formed, then decode defensively.
    u = [x for x in t if len(hex(ord(x)))-2 <= 4]
    u = ("".join(u)).decode("utf-8","ignore")
    #print u
    return u
def append_if_ok(heads,word):
    # Tentatively add word's first character to the shard head list, but
    # only after verifying the resulting filename is actually creatable on
    # this filesystem (probe: write then delete a stub file).
    try:
        fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
        open(fn,'wb').write("test.")
        os.remove(fn)
        heads.append(unicode(word)[0].lower())
        return True
    except:
        # Any failure (bad character, OS filename limits, ...) routes the
        # word to the extended.json catch-all instead of a named shard.
        print("BAD WORD NAME:",word)
        return False
# Accumulator state: `heads` holds the head characters of the shard being
# built, `result` maps headword -> entry for that shard, and `weirdo`
# collects entries whose headword cannot form a filename.
heads = []
result = {}
weirdo = {}
for e in ent:
    word = e[0]
    content = e[1]
    print word,
    is_ok = True
    if len(heads) == 0:
        heads = []
        is_ok = append_if_ok(heads,word)
    else:
        try:
            b = not (word.lower()).startswith(heads[-1].lower())
        except:
            continue
        # b: this word starts a new head character.  Flush the current
        # shard to disk once it grows large enough, otherwise just extend
        # the shard with the new head character.
        if b:
            if len(result) > 1000 or len(heads) >= 128:
                fn = heads_to_name(heads)
                open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))
                result = {}
                heads = []
                is_ok = append_if_ok(heads,word)
            else:
                is_ok = append_if_ok(heads,word)
    # NOTE(review): entries accumulated after the last flush appear never
    # to be written out at end of loop - confirm against the original file.
    # Pull the traditional form, readings and definition list out of the
    # entry's HTML.
    traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)</li>",content))
    pinyin = first(re.findall(r"拼音.*?(.*?)[<\t ]",content))
    zhuyin = first(re.findall(r"注音.*?(.*?)<",content))
    definition = re.findall(r"<li.*?>(.*?)</li>",content)
    if len(definition) == 0:
        definition = [content]
    definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
    definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
    definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
    #definition = [re.sub(r"【解释】:","",d).strip() for d in definition]
    traditional = re.sub(r"<.*?>","",traditional).replace(" ","")
    # Drop boilerplate lines (encodings, repeated readings, ...).
    # NOTE(review): the '"" not in d' test is always False in Python; a
    # filter character appears to have been lost from this copy of the
    # source - confirm against the original file.
    definition = [rem_bad_char(d) for d in definition
        if (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
        and (len(zhuyin) == 0 or zhuyin not in d ) \
        and "繁体" not in d
        and "简体" not in d
        and "郑码" not in d
        and "拼音" not in d
        and "粤语:" not in d
        and "潮州话:" not in d
        and "UniCode" not in d
        and "" not in d
        and len(d.replace(word,"")) > 0
        ]
    # If filtering removed every line, fall back to the whole raw body.
    if len(definition) == 0:
        definition = [rem_bad_char(content)]
        definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
    thing = {"TRD":traditional.decode('utf-8',"ignore"),
        "PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
        "DEF":definition,
        }
    if is_ok:
        result[word]=thing
    else:
        weirdo[word]=thing
# Words that could not form a shard filename go to the catch-all shard.
open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))

8
tools/to_txt.py Executable file
View File

@@ -0,0 +1,8 @@
#!/usr/bin/env python2.7
# Dump every record of the Kindle dictionary (../zdic.prc) to stdout;
# setup.sh redirects this into ../zdic.txt.
from mobi import Mobi
book = Mobi("../zdic.prc");
book.parse();
# Python 2 print with a trailing comma: records are separated by spaces
# rather than newlines.
for record in book:
    print record,