initial commit
This commit is contained in:
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
.DS_Store
|
||||
*/.DS_Store
|
||||
zdic.prc
|
||||
zdic.txt
|
||||
zdic_json/*
|
||||
tools/mobi/*
|
||||
zdic-cli-*
|
||||
88
README.md
Normal file
88
README.md
Normal file
@@ -0,0 +1,88 @@
|
||||
# zdic-cli
|
||||
|
||||
An alternative, offline, regex-supporting, command-line interface to [zdic (漢典)](zdic.net), featuring:
|
||||
|
||||
- No internet connection required
|
||||
- Full text search with regex: a command for finding characters/phrases in body text of definitions.
|
||||
- Colorful text for highlighting entries
|
||||
|
||||
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
## Downloads
|
||||
|
||||
Standalone versions are available for Windows, macOS and Linux. Please refer to the Releases page.
|
||||
|
||||
## Usage
|
||||
|
||||
zdic-cli runs as a REPL loop, and there are 5 types of commands you can enter:
|
||||
|
||||
```
|
||||
def 甲 display entry for 甲
|
||||
pre 甲 list words that starts with 甲
|
||||
has 甲 乙 ... list words that contains 甲 and 乙 ...
|
||||
txt 甲 list words whose full entry text contains 甲 (regex supported)
|
||||
sel n display entry at previously returned list index n
|
||||
```
|
||||
|
||||
`def` is probably the most common one for simply looking up characters/words. `pre` `has` `txt` will return an enumerated list of all eligible entries, and `sel 0`/`sel 1`/`sel n` can be used to select from the list.
|
||||
|
||||
|
||||
## Development setup
|
||||
|
||||
**This section is for compiling from source (and is somewhat complex), if you just would like to use the software, please check out the Releases page.**
|
||||
|
||||
### Dependencies
|
||||
|
||||
- node.js/npm
|
||||
- python2. Tested on 2.7. It has to be python 2 instead of 3 because much of the data-processing work was done way back in a time when 2 was the norm. Sorry folks, but if you're a python3 purist feel free to send a PR!
|
||||
- pkg (optional, to build standalone binaries for multiple platforms) `npm install -g pkg`
|
||||
|
||||
### Downloading and compiling the dictionary files
|
||||
|
||||
This repo does not contain the dictionary files themselves as they're too large. Instead, a shell script (`setup.sh`) is provided to automate the process:
|
||||
|
||||
- Automatically download the original Kindle dictionary format (.PRC) from the internet. The download link hardcoded in `setup.sh` might fail in the future, in which case a google search for `汉典.prc` should yield alternative resources.
|
||||
- Automatically download a python library (kroo/mobi-python) for parsing mobi files. It is a rather old library with some oddities, so a find-and-replace script is run automatically to patch some glitches in its source code :P
|
||||
- `python/to_txt.py` is run to extract a raw `txt` file from the kindle format `prc`.
|
||||
- `python/to_json.py` is run to generate a directory of `json` files from the `txt` to make lookup and formatting more efficient.
|
||||
|
||||
Run the shell script with:
|
||||
|
||||
```
|
||||
sh setup.sh
|
||||
```
|
||||
|
||||
At this point you'll have `zdic.prc`, `zdic.txt` and directory `zdic_json/`. Only `zdic_json` is needed, so if everything went well with the script, you can freely delete the other two and gain some 700MB of free space :)
|
||||
|
||||
### Compiling the binary
|
||||
|
||||
At this point you can also run the software by simply doing:
|
||||
|
||||
```
|
||||
node index.js
|
||||
```
|
||||
|
||||
You can also package it into a binary using
|
||||
|
||||
```
|
||||
pkg .
|
||||
```
|
||||
|
||||
provided you have the node and pkg dependencies installed.
|
||||
|
||||
Tip: to run the software by typing `zdic` anywhere, you can symlink it into `/usr/bin`, e.g.
|
||||
|
||||
```
|
||||
ln -s path/to/zdic-cli/zdic-cli /usr/bin/zdic
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
259
index.js
Normal file
259
index.js
Normal file
@@ -0,0 +1,259 @@
|
||||
const fs = require("fs")
const readline = require('readline');

// Directory of pre-built dictionary shards (generated by setup.sh).
// Each shard is a JSON object mapping headword -> entry record.
const dict_path = __dirname+'/zdic_json'
|
||||
|
||||
// Find the dictionary shard whose filename contains the first character
// of `word`; entries whose head char has no dedicated shard live in the
// catch-all "extended.json".
function locate_file(word){
  const head = word.slice(0, 1);
  for (const name of fs.readdirSync(dict_path)) {
    if (name.includes(head)) {
      return name;
    }
  }
  return "extended.json";
}
|
||||
|
||||
// Load and parse one shard file (path relative to dict_path).
function read_json(file_path){
  const raw = fs.readFileSync(dict_path + "/" + file_path);
  return JSON.parse(raw.toString());
}
|
||||
|
||||
// Look `word` up in its shard.
// Returns [word, entry] when found, or null when the word is absent.
// Fixes: removed the unused `ret` accumulator, and renamed the local that
// shadowed the module-level `dict_path` constant.
function define(word){
  const shard = locate_file(word);
  const dict = read_json(shard);
  if (word in dict){
    return [word, dict[word]];
  }
  return null;
}
|
||||
|
||||
// List every headword in `word`'s shard that begins with `word`.
// Fix: `for (w in dict)` previously leaked `w` as an implicit global
// (sloppy mode); it is now properly block-scoped.
function starts_with(word){
  const dict = read_json(locate_file(word));
  const ret = [];
  for (const w in dict){
    if (w.startsWith(word)){
      ret.push(w);
    }
  }
  return ret;
}
|
||||
|
||||
// List every headword (across all shards) that contains every string in
// `words` as a substring.
function contains(words){
  const ret = [];
  for (const file of fs.readdirSync(dict_path)) {
    if (!file.endsWith(".json")) {
      continue;
    }
    const dict = read_json(file);
    for (const w in dict) {
      if (words.every((needle) => w.includes(needle))) {
        ret.push(w);
      }
    }
  }
  return ret;
}
|
||||
|
||||
// Search the body text of every definition for the regex `word`.
// For each matching headword, a display item
//   [running index, headword, snippet of ~view_len chars around the match]
// is passed to `callback` (so results stream out while the scan runs),
// and only the headword is collected into the returned list.
// Each headword is reported at most once (first matching definition wins).
function full_text_search(word, view_len, callback){
  const ret = [];
  const pad = Math.floor((view_len - word.length) / 2);
  const re = new RegExp(word);
  for (const file of fs.readdirSync(dict_path)) {
    if (!file.endsWith(".json")) {
      continue;
    }
    const dict = read_json(file);
    for (const w in dict) {
      const defs = dict[w]['DEF'];
      for (let j = 0; j < defs.length; j++) {
        const idx = defs[j].search(re);
        if (idx == -1) {
          continue;
        }
        const snippet = defs[j].slice(Math.max(idx - pad, 0), idx + word.length + pad);
        const item = [ret.length, w, snippet];
        ret.push(w);
        callback(item);
        break;
      }
    }
  }
  return ret;
}
|
||||
|
||||
// Render one dictionary entry as ANSI-colored terminal text:
// headword in bold green, optional traditional form in parentheses,
// optional pinyin/zhuyin in yellow brackets, then each definition line
// prefixed with a dim 〇 marker and soft-wrapped to the terminal width.
// NOTE: "\033[1m" is a legacy octal escape (same bytes as "\x1b[1m");
// it only parses because this file runs in sloppy mode.
function render_def(word,entry){
  var result = ""
  result += "\x1b[32m\033[1m"+word+"\x1b[0m"
  // Traditional-characters variant, shown as "(...)" after the headword.
  if (entry['TRD'] != ''){
    result += " \x1b[32m\033[1m("+entry['TRD']+")\x1b[0m "
  }
  // PRN is [pinyin, zhuyin]. The bracket opens/closes only when pinyin is
  // non-empty, so a zhuyin-only entry renders " , zhuyin" without brackets.
  if (entry['PRN'][0] != ''){
    result += " \x1b[33m[ "+entry['PRN'][0].trim()+""
  }
  if (entry['PRN'][1] != ''){
    result += " , "+entry['PRN'][1].trim()
  }
  if (entry['PRN'][0] != ''){
    result += " ]\x1b[0m"
  }
  result += "\n"
  // Usable width; NOTE(review): process.stdout.columns is undefined when
  // stdout is not a TTY, making n NaN and disabling wrapping — confirm
  // whether non-TTY output matters here.
  var n = process.stdout.columns-4;

  for (var i = 0; i < entry['DEF'].length; i++){
    result += "\x1b[2m〇\x1b[0m"
    // j counts display columns, assuming every character is full-width
    // (2 columns); starts at 1 and is incremented by 2 per character.
    var j = 1;
    var t = entry['DEF'][i];
    // isf: still inside the first sentence; after the first 。 the rest of
    // the line is rendered dim.
    var isf = true;
    for (var c of t){
      // Book/source titles 《...》 are colored red.
      if (c == "《"){
        result += "\x1b[31m"
      }
      // "~" (placeholder for the headword) is highlighted yellow, then the
      // dim state is restored if we are past the first sentence.
      if (c == "~"){
        result += "\x1b[33m"+c+"\x1b[0m";
        if (!isf){
          result += "\x1b[2m"
        }
      }else{
        result += c;
      }

      if (c == "》"){
        result += "\x1b[0m"
        if (!isf){
          result += "\x1b[2m"
        }
      }
      // First full stop: everything after it is dimmed.
      if (c == "。" && isf){
        result += "\x1b[2m"
        isf = false;
      }
      j+=2;
      // Soft wrap: reset to column 2 to account for the two-space indent
      // that aligns continuation lines under the 〇 marker.
      if (j >= n){
        j = 2;
        result += "\n  "

      }
    }
    result+="\x1b[0m\n"
  }
  result += ""
  return result
}
|
||||
|
||||
// Pad (or truncate) string `x` with trailing spaces to exactly `n` chars.
function char_pad(x, n){
  const padded = x + " ".repeat(n);
  return padded.slice(0, n);
}

// Format one list entry: a yellow, 4-wide right-aligned index followed by
// the word padded to `n` characters.
function render_item(i, x, n){
  const index = String(i).padStart(4);
  return `\x1b[33m${index}\x1b[0m ${char_pad(x, n)}`;
}
|
||||
|
||||
// Lay the word list out in terminal-width columns, each item rendered by
// render_item. Non-CJK characters are stripped from the displayed word;
// if that leaves nothing, the raw entry is shown instead.
// Fix: process.stdout.columns is undefined when stdout is not a TTY,
// which previously made the per-line count NaN and suppressed every line
// break; fall back to an 80-column layout in that case.
function render_list(lst){
  const itemWidth = 6;
  const cols = process.stdout.columns || 80;
  const perLine = Math.floor((cols - 2) / (itemWidth * 2 + 5));
  let result = "";
  for (let i = 0; i < lst.length; i++) {
    // Keep only CJK ideographs for display purposes.
    let shown = lst[i].replace(/[^一-鿿]/g, "");
    if (!shown.length) { // oops — nothing left, show the raw entry
      shown = lst[i];
    }
    result += render_item(i, shown, itemWidth);
    if (i % perLine == perLine - 1) {
      result += "\n";
    }
  }
  return result;
}
|
||||
|
||||
|
||||
// REPL help table: command name -> [usage string, description].
// Printed by main() when a command is entered without an argument.
// (These strings are user-facing output; keep them byte-exact.)
var commands = {
  "def":["def x ","display entry for x"],
  "pre":["pre x ","list words that starts with x"],
  "has":["has x y ...","list words that contains x and y ..."],
  "txt":["txt x ","list words whose full entry text contains x (regex supported)"],
  "sel":["sel n ","display entry at previously returned list index n"],
}
|
||||
|
||||
// REPL driver: prompt for one command, execute it, then recurse to read the
// next. `prev` carries the word list produced by the previous pre/has/txt
// command so that `sel n` can index into it on a later iteration.
function main(prev){
  var curr = null;   // list result (pre/has/txt) of this iteration
  var def = null;    // NOTE(review): shadowed by the `var def`s hoisted
                     // inside the callback below; this outer one is unused.
  var fail = false;
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });
  rl.question('> ', (answer) => {
    try{
      answer = answer.trim();
      // Commands are a fixed 3 letters + one space + argument ("def 甲").
      var cmd = answer.slice(0,3);
      var arg = answer.slice(4);
      if (!arg.length){
        // Bare command: print its usage line. An unknown command makes
        // commands[cmd] undefined, throwing into the catch below.
        console.log(`\x1b[36m${commands[cmd][0]} \x1b[0m${commands[cmd][1]}`);
      }else{
        // The `var def`s in these branches hoist to the top of this arrow
        // function; before any assignment `def` is undefined, and the
        // loose `def == null` check below treats that as "no result".
        if (cmd == "def"){
          var def = define(arg);
          if (def){
            console.log(render_def(...def));
          }
        }else if (cmd == "pre"){
          curr = starts_with(arg);
          console.log(render_list(curr));
        }else if (cmd == "has"){
          // Multiple space-separated arguments must all be contained.
          curr = contains(arg.split(" "));
          console.log(render_list(curr));
        }else if (cmd == "txt"){
          // Stream matches as they are found; snippet width is derived
          // from the terminal width.
          curr = full_text_search(arg,Math.floor((process.stdout.columns-20)/2),function(x){
            console.log(`${render_item(x[0],x[1],6)}\x1b[2m${x[2]}\x1b[0m`);
          });
        }else if (cmd == "sel"){
          // Select from the list returned by the previous iteration.
          var def = define(prev[parseInt(arg)]);
          if (def){
            console.log(render_def(...def))
          }
        }else{
          console.log("\x1b[31munsupported command.\x1b[0m")
          fail = true
        }
        if (!fail && (def == null && (curr == null || curr.length == 0))){
          console.log("\x1b[2m(0 result returned)\x1b[0m")
        }
      }
    }catch(e){
      // Anything malformed falls back to treating the whole input as a
      // direct `def`-style lookup.
      console.log("\x1b[31mcommand parse failed. trying as direct query...\x1b[0m")
      // console.log(e)
      try{
        var def = define(answer);
        if (def){
          console.log(render_def(...def));
        }else{
          throw new Error();
        }
      }catch(ee){
        console.log("\x1b[31mcommand parse totally failed.\x1b[0m")
      }
    }
    rl.close()
    // Loop: keep this iteration's list if it produced one, otherwise keep
    // the previous list so `sel` keeps working.
    main(curr||prev);
  });

}
|
||||
|
||||
// Startup banner and command summary, then enter the REPL.
// NOTE(review): "Unoffical" is a typo, but the banner strings are padded to
// a fixed 45-column box, so fixing it requires re-padding the line — left
// as-is here to keep output byte-identical.
console.log("╔═════════════════════════════════════════════╗")
console.log("║\x1b[31m                 漢 典 CLI                   \x1b[0m║")
console.log("║Unoffical offline 漢典 (zdic.net) commandline║")
console.log("║\x1b[2m w/ data derived from `汉典.prc` (for Kindle)\x1b[0m║")
console.log("║\x1b[2m            Lingdong Huang 2020              \x1b[0m║");
console.log("╚═════════════════════════════════════════════╝")
console.log(`commands: ${Object.keys(commands).map(x=>( "\x1b[36m"+x+"\x1b[0m" )).join(",") }, run without arguments to see help`);
main();
|
||||
7
package.json
Normal file
7
package.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"name":"zdic-cli",
|
||||
"pkg": {
|
||||
"assets": "zdic_json/*"
|
||||
},
|
||||
"bin":"index.js"
|
||||
}
|
||||
BIN
screenshots/screen000.png
Normal file
BIN
screenshots/screen000.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 226 KiB |
BIN
screenshots/screen001.png
Normal file
BIN
screenshots/screen001.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 370 KiB |
BIN
screenshots/screen002.png
Normal file
BIN
screenshots/screen002.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 413 KiB |
BIN
screenshots/screen003.png
Normal file
BIN
screenshots/screen003.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 407 KiB |
BIN
screenshots/screen004.png
Normal file
BIN
screenshots/screen004.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 454 KiB |
25
setup.sh
Normal file
25
setup.sh
Normal file
@@ -0,0 +1,25 @@
|
||||
#!/bin/sh
# One-shot setup: download the Kindle dictionary, fetch and patch the
# mobi-python dependency, then convert .prc -> .txt -> per-shard JSON.
# Fix: abort on the first failing command (set -e) so a failed download or
# `cd` no longer cascades into running the converters on bad/missing data.
set -e

echo "downloading the dictionary..."
curl https://blog.xjpvictor.info/wp-content/uploads/汉典.prc > zdic.prc

cd tools

echo "downloading dependencies..."
git clone https://github.com/kroo/mobi-python
cp -r mobi-python/mobi ./mobi
rm -rf mobi-python

echo "patching bugs in dependencies..."
chmod +x patch.py
./patch.py

echo "converting dictionary to txt... (this might take a while ~10 mins)"
chmod +x to_txt.py
./to_txt.py > ../zdic.txt

echo "converting txt to json..."
mkdir ../zdic_json
chmod +x to_json.py
./to_json.py

echo "done setting up!"
echo "to use the app, either run 'pkg .' to package into a binary, or run 'node index.js' directly."
||||
16
tools/patch.py
Executable file
16
tools/patch.py
Executable file
@@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env python2.7
# Patch two known glitches in the vendored kroo/mobi-python library.
# Run by setup.sh after the library is copied into tools/mobi.
# The find/replace strings below must match the library source byte-for-byte;
# do not reformat them.

# Patch 1: in mobi/__init__.py, capture the decompressed output into
# `result` and widen the record slice (recordnum+1 -> recordnum+2).
# Patch 2: iterate over a large fixed record range instead of trusting the
# 'First Non-book index' header field.
t = open("mobi/__init__.py",'r').read().replace(
"uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])",
"result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+2]['record Data Offset']-self.config['mobi']['extra bytes']])"
).replace(
"for record in range(1, self.config['mobi']['First Non-book index'] - 1):",
"for record in range(0, 10000000,1):"
)
open("mobi/__init__.py",'w').write(t)

# Silence a noisy WARNING print inside the LZ77 decompressor by commenting
# out the statement across its two source lines.
t = open("mobi/lz77.py",'r').read().replace(
"print(\"WARNING:","#print(\"WARNING:"
).replace(
"\" beginning of text!",
"#"
)
open("mobi/lz77.py",'w').write(t)
|
||||
114
tools/to_json.py
Executable file
114
tools/to_json.py
Executable file
@@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# Split the raw dictionary dump (../zdic.txt) into many small JSON shards
# under ../zdic_json/, keyed by headword. Python 2 only: relies on
# str/unicode semantics and sys.setdefaultencoding.
import os
import re
import io
import sys
import json

# Python 2 hack: make implicit str<->unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Each dictionary entry in the dump looks like
# "<h2>headword</h2> ...body... <mbp:pagebreak/>".
# NOTE(review): the pattern has no re.DOTALL, so an entry is assumed to sit
# on a single line of the dump — confirm against to_txt.py's output format.
txt = open("../zdic.txt",'r').read()
ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
ent = [(x[0].strip(),x[1]) for x in ent]
# Drop entries whose headword is empty after trimming.
ent = [x for x in ent if x[0] != ""]
|
||||
|
||||
def first(l):
    """Return the first element of `l`, or the empty string when `l` is empty."""
    if not len(l):
        return ""
    return l[0]
|
||||
|
||||
def rem_bad_char(t):
    # Drop byte sequences that are not valid UTF-8
    # (Python 2: str -> unicode, invalid bytes silently ignored).
    return t.decode('utf-8','ignore')
|
||||
|
||||
def heads_to_name(t):
    # Build a shard filename from the list of head characters `t`.
    # Keep only characters whose code point fits in 4 hex digits (i.e. the
    # Basic Multilingual Plane); others are dropped from the name.
    u = [x for x in t if len(hex(ord(x)))-2 <= 4]
    # NOTE(review): `t` appears to hold unicode chars (see append_if_ok), so
    # .decode() on the joined unicode relies on the setdefaultencoding('utf8')
    # call at module top — confirm.
    u = ("".join(u)).decode("utf-8","ignore")
    #print u
    return u
|
||||
|
||||
def append_if_ok(heads,word):
    # Try to extend the current shard with `word`'s first character:
    # probe that the resulting shard filename is actually creatable on this
    # filesystem, and only then record the head character.
    # Mutates `heads` in place; returns True on success, False otherwise.
    try:
        fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
        # Touch-and-delete probe: if open() or remove() raises, the name is
        # unusable and the word goes to the extended.json bucket instead.
        open(fn,'wb').write("test.")
        os.remove(fn)
        heads.append(unicode(word)[0].lower())
        return True
    except:
        # Python 2: parenthesized print outputs a tuple, e.g.
        # ('BAD WORD NAME:', word) — harmless progress noise.
        print("BAD WORD NAME:",word)
        return False
|
||||
|
||||
heads = []      # head characters of words currently in the open shard
result = {}     # entries for the current shard, flushed to disk when full
weirdo = {}     # entries whose head char can't form a filename -> extended.json
for e in ent:

    word = e[0]
    content = e[1]
    # Progress output (Python 2 print; trailing comma suppresses newline).
    print word,

    is_ok = True

    if len(heads) == 0:
        # Very first entry (or shard just flushed): open a fresh shard.
        heads = []
        is_ok = append_if_ok(heads,word)

    else:
        try:
            # A new head character is needed when this word does not start
            # with the most recently recorded head.
            b = not (word.lower()).startswith(heads[-1].lower())
        except:
            continue
        if b:
            if len(result) > 1000 or len(heads) >= 128:
                # Shard is full: write it out and start a new one.
                fn = heads_to_name(heads)
                open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))

                result = {}
                heads = []
                is_ok = append_if_ok(heads,word)
            else:
                is_ok = append_if_ok(heads,word)


    # Scrape the traditional form, pinyin, zhuyin and the definition list
    # out of the entry's HTML with regexes.
    traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)</li>",content))
    pinyin = first(re.findall(r"拼音.*?:(.*?)[<\t ]",content))
    zhuyin = first(re.findall(r"注音.*?:(.*?)<",content))
    definition = re.findall(r"<li.*?>(.*?)</li>",content)
    if len(definition) == 0:
        # No <li> items: treat the whole body as one definition.
        definition = [content]

    # Strip tags, leading enumerations like "1." / "(2)", and section headers.
    definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
    definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
    definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
    #definition = [re.sub(r"【解释】:","",d).strip() for d in definition]
    traditional = re.sub(r"<.*?>","",traditional).replace(" ","")

    # Drop definition lines that merely repeat pronunciation/encoding
    # metadata, and lines that contain nothing beyond the headword itself.
    definition = [rem_bad_char(d) for d in definition
        if (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
        and (len(zhuyin) == 0 or zhuyin not in d ) \
        and "繁体" not in d
        and "简体" not in d
        and "郑码" not in d
        and "拼音" not in d
        and "粤语:" not in d
        and "潮州话:" not in d
        and "UniCode" not in d
        and "◎" not in d
        and len(d.replace(word,"")) > 0
        ]

    if len(definition) == 0:
        # Everything got filtered out: fall back to the whole entry body
        # with tags stripped.
        definition = [rem_bad_char(content)]
        definition = [re.sub(r"<.*?>","",d).strip() for d in definition]


    # Entry record consumed by index.js: TRD = traditional form,
    # PRN = [pinyin, zhuyin], DEF = list of definition strings.
    thing = {"TRD":traditional.decode('utf-8',"ignore"),
        "PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
        "DEF":definition,
        }
    if is_ok:
        result[word]=thing
    else:
        weirdo[word]=thing

# NOTE(review): the last in-progress `result` shard is never written after
# the loop — entries accumulated since the final flush appear to be lost;
# confirm whether this is intentional.
open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))
|
||||
|
||||
8
tools/to_txt.py
Executable file
8
tools/to_txt.py
Executable file
@@ -0,0 +1,8 @@
|
||||
#!/usr/bin/env python2.7
# Dump every record of the Kindle dictionary (../zdic.prc) to stdout;
# setup.sh redirects this into ../zdic.txt for to_json.py to consume.
from mobi import Mobi

book = Mobi("../zdic.prc");
book.parse();

for record in book:
    # Trailing comma: Python 2 print without a newline between records.
    print record,
|
||||
Reference in New Issue
Block a user