commit 86eb38d053dbd504d4b92f180cb3ccbbf336d985
Author: lingdong huang
Date:   Thu Aug 6 13:32:21 2020 -0400

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..340170b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.DS_Store
+*/DS_Store
+zdic.prc
+zdic.txt
+zdic_json/*
+tools/mobi/*
+zdic-cli-*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..36bc26f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,88 @@
+# zdic-cli
+
+An alternative, offline, regex-supporting, command-line interface to [zdic (漢典)](https://zdic.net), featuring:
+
+- No internet connection required
+- Full text search with regex: a command for finding characters/phrases in body text of definitions.
+- Colorful text for highlighting entries
+
+
+![](screenshots/screen000.png)
+![](screenshots/screen001.png)
+![](screenshots/screen002.png)
+![](screenshots/screen003.png)
+![](screenshots/screen004.png)
+
+
+## Downloads
+
+Standalone version available for windows, mac and linux. Please refer to the Releases page.
+
+## Usage
+
+zdic-cli runs as a REPL loop, and there are 5 types of commands you can enter:
+
+```
+def 甲    display entry for 甲
+pre 甲    list words that start with 甲
+has 甲 乙 ...    list words that contain 甲 and 乙 ...
+txt 甲    list words whose full entry text contains 甲 (regex supported)
+sel n    display entry at previously returned list index n
+```
+
+`def` is probably the most common one for simply looking up characters/words. `pre` `has` `txt` will return an enumerated list listing all eligible entries, and `sel 0`/`sel 1`/`sel n` can be used to select from the list.
+
+
+## Development setup
+
+**This section is for compiling from source (and is somewhat complex), if you just would like to use the software, please check out the Releases page.**
+
+### Dependencies
+
+- node.js/npm
+- python2. Tested on 2.7. It has to be python 2 instead of 3 because much of the data-processing work was done way back in a time when 2 was the norm.
Sorry folks, but if you're a python3 purist, feel free to send a PR!
+- pkg (optional, to build standalone binaries for multiple platforms) `npm install -g pkg`
+
+### Downloading and compiling the dictionary files
+
+This repo does not contain the dictionary files themselves as they're too large. Instead, a shell script (`setup.sh`) is provided to automate the process:
+
+- Automatically download the original Kindle dictionary format (.PRC) from the internet. The download link hardcoded in `setup.sh` might fail in the future, in which case a Google search for `汉典.prc` should yield alternative resources.
+- Automatically download a python library (kroo/mobi-python) for parsing mobi files. A rather old library that has some oddities; a find-and-replace script will be automatically run to patch some glitches in the source code :P
+- `tools/to_txt.py` is run to extract a raw `txt` file from the kindle format `prc`.
+- `tools/to_json.py` is run to generate a directory of `json` files from the `txt` to make lookup and formatting more efficient.
+
+Run the shell script with:
+
+```
+sh setup.sh
+```
+
+At this point you'll have `zdic.prc`, `zdic.txt` and directory `zdic_json/`. Only `zdic_json` is needed, so if everything went well with the script, you can freely delete the other two and gain some 700MB of free space :)
+
+### Compiling the binary
+
+At this point you can also run the software by simply doing:
+
+```
+node index.js
+```
+
+You can also package it into a binary using
+
+```
+pkg .
+```
+
+provided you have the node and pkg dependencies installed.
+
+Tip: to run the software by typing `zdic` anywhere, you can symlink it to `/usr/bin`, e.g.
+
+```
+ln -s path/to/zdic-cli/zdic-cli /usr/bin/zdic
+```
+
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 0000000..329fdc2
--- /dev/null
+++ b/index.js
@@ -0,0 +1,259 @@
+const fs = require("fs")
+const readline = require('readline');
+
+const dict_path = __dirname+'/zdic_json'
+
+// Pick the dictionary shard file whose name contains the first character
+// of `word`; fall back to the catch-all "extended.json" shard.
+function locate_file(word){
+  const folder = dict_path;
+  var files = fs.readdirSync(folder)
+  for (var i = 0; i < files.length; i++){
+    if (files[i].includes(word.slice(0,1))){
+      return files[i]
+    }
+  }
+  return "extended.json"
+}
+
+// Load one shard (a JSON object mapping word -> entry) from zdic_json/.
+function read_json(file_path){
+  return JSON.parse(fs.readFileSync(dict_path+"/"+file_path).toString())
+}
+
+// Exact lookup: returns [word, entry] or null if the word is not present.
+function define(word){
+  var dict_path = locate_file(word)
+  var dict = read_json(dict_path)
+  var ret = []
+  if (word in dict){
+    return [word,dict[word]]
+  }
+  return null;
+}
+
+// Prefix search within the shard that holds `word`'s first character.
+function starts_with(word){
+  var ret = []
+  var dict = read_json(locate_file(word))
+  for (w in dict){
+    if (w.startsWith(word)){
+      ret.push(w)
+    }
+  }
+  return ret;
+}
+
+// Return every word (across all shards) that contains all of `words`.
+function contains(words){
+  // console.log(words)
+  var ret = []
+  var files = fs.readdirSync(dict_path)
+  for (var i = 0; i < files.length; i++){
+    if (!files[i].endsWith(".json")){
+      continue;
+    }
+    var dict = read_json(files[i]);
+    for (var w in dict){
+      var ok = true;
+      for (var j = 0; j < words.length; j++){
+        if (!w.includes(words[j])){
+          ok = false;
+          break;
+        }
+      }
+      if (ok){
+        ret.push(w);
+      }
+    }
+  }
+  return ret;
+}
+
+// Regex search over definition bodies; invokes `callback` with
+// [index, word, snippet] per hit and returns the matched words.
+function full_text_search(word,view_len,callback){
+  var ret = []
+  var pd = Math.floor((view_len-word.length)/2);
+  var files = fs.readdirSync(dict_path)
+  var re = new RegExp(word);
+  for (var i = 0; i < files.length; i++){
+    if (!files[i].endsWith(".json")){
+      continue;
+    }
+    var dict = read_json(files[i]);
+    for (var w in dict){
+      for (var j = 0; j < dict[w]['DEF'].length; j++){
+        var idx = dict[w]['DEF'][j].search(re)
+        if (idx == -1){
+          continue;
+        }
+        var it = [ret.length,w,dict[w]['DEF'][j].slice(Math.max(idx-pd,0),idx+word.length+pd)];
+        ret.push(w);
+        callback(it);
+        break;
+      }
+    }
+  }
+  return ret;
+}
+
+// Render one dictionary entry with ANSI colors, wrapped to terminal width.
+function render_def(word,entry){
+  var result = ""
+  result += "\x1b[32m\033[1m"+word+"\x1b[0m"
+  if (entry['TRD'] != ''){
+    result += " \x1b[32m\033[1m("+entry['TRD']+")\x1b[0m "
+  }
+  if (entry['PRN'][0] != ''){
+    result += " \x1b[33m[ "+entry['PRN'][0].trim()+""
+  }
+  if (entry['PRN'][1] != ''){
+    result += " , "+entry['PRN'][1].trim()
+  }
+  if (entry['PRN'][0] != ''){
+    result += " ]\x1b[0m"
+  }
+  result += "\n"
+  var n = process.stdout.columns-4;
+
+  for (var i = 0; i < entry['DEF'].length; i++){
+    result += "\x1b[2m〇\x1b[0m"
+    var j = 1;
+    var t = entry['DEF'][i];
+    var isf = true;
+    for (var c of t){
+      if (c == "《"){
+        result += "\x1b[31m"
+      }
+      if (c == "~"){
+        result += "\x1b[33m"+c+"\x1b[0m";
+        if (!isf){
+          result += "\x1b[2m"
+        }
+      }else{
+        result += c;
+      }
+
+      if (c == "》"){
+        result += "\x1b[0m"
+        if (!isf){
+          result += "\x1b[2m"
+        }
+      }
+      if (c == "。" && isf){
+        result += "\x1b[2m"
+        isf = false;
+      }
+      j+=2;
+      if (j >= n){
+        j = 2;
+        result += "\n  "
+
+      }
+    }
+    result+="\x1b[0m\n"
+  }
+  result += ""
+  return result
+}
+
+function char_pad(x,n){
+  return (x+" ".repeat(n)).slice(0,n);
+}
+function render_item(i,x,n){
+  return `\x1b[33m${i.toString().padStart(4)}\x1b[0m ${char_pad(x,n)}`;
+}
+
+// Lay the result list out in terminal-width columns.
+function render_list(lst){
+  var npi = 6;
+  var npl = Math.floor((process.stdout.columns-2)/(npi*2+5))
+  var result = "";
+  for (var i = 0; i < lst.length; i++){
+    var li = lst[i];
+    li = li.replace(/[^一-鿿]/g,"")
+    if (!li.length){//oops
+      li = lst[i];
+    }
+    result += render_item(i,li,npi);
+    if (i % npl == npl-1){
+      result += "\n"
+    }else{
+      result += ""
+    }
+  }
+  return result;
+}
+
+
+var commands = {
+  "def":["def x ","display entry for x"],
+  "pre":["pre x ","list words that starts with x"],
+  "has":["has x y ...","list words that contains x and y ..."],
+  "txt":["txt x ","list words whose full entry text contains x (regex supported)"],
+  "sel":["sel n ","display entry at previously returned list index n"],
+}
+
+// REPL: read one command, dispatch, then recurse with the last result list
+// (so `sel n` can index into it).
+function main(prev){
+  var curr = null;
+  var def = null;
+  var fail = false;
+  const rl = readline.createInterface({
+    input: process.stdin,
+    output: process.stdout
+  });
+  rl.question('> ', (answer) => {
+    try{
+      answer = answer.trim();
+      var cmd = answer.slice(0,3);
+      var arg = answer.slice(4);
+      if (!arg.length){
+        console.log(`\x1b[36m${commands[cmd][0]} \x1b[0m${commands[cmd][1]}`);
+      }else{
+        if (cmd == "def"){
+          var def = define(arg);
+          if (def){
+            console.log(render_def(...def));
+          }
+        }else if (cmd == "pre"){
+          curr = starts_with(arg);
+          console.log(render_list(curr));
+        }else if (cmd == "has"){
+          curr = contains(arg.split(" "));
+          console.log(render_list(curr));
+        }else if (cmd == "txt"){
+          curr = full_text_search(arg,Math.floor((process.stdout.columns-20)/2),function(x){
+            console.log(`${render_item(x[0],x[1],6)}\x1b[2m${x[2]}\x1b[0m`);
+          });
+        }else if (cmd == "sel"){
+          var def = define(prev[parseInt(arg)]);
+          if (def){
+            console.log(render_def(...def))
+          }
+        }else{
+          console.log("\x1b[31munsupported command.\x1b[0m")
+          fail = true
+        }
+        if (!fail && (def == null && (curr == null || curr.length == 0))){
+          console.log("\x1b[2m(0 result returned)\x1b[0m")
+        }
+      }
+    }catch(e){
+      console.log("\x1b[31mcommand parse failed. trying as direct query...\x1b[0m")
+      // console.log(e)
+      try{
+        var def = define(answer);
+        if (def){
+          console.log(render_def(...def));
+        }else{
+          throw new Error();
+        }
+      }catch(ee){
+        console.log("\x1b[31mcommand parse totally failed.\x1b[0m")
+      }
+    }
+    rl.close()
+    main(curr||prev);
+  });
+
+}
+
+console.log("╔═════════════════════════════════════════════╗")
+console.log("║\x1b[31m 漢 典 CLI \x1b[0m║")
+console.log("║Unoffical offline 漢典 (zdic.net) commandline║")
+console.log("║\x1b[2m w/ data derived from `汉典.prc` (for Kindle)\x1b[0m║")
+console.log("║\x1b[2m Lingdong Huang 2020 \x1b[0m║");
+console.log("╚═════════════════════════════════════════════╝")
+console.log(`commands: ${Object.keys(commands).map(x=>( "\x1b[36m"+x+"\x1b[0m" )).join(",") }, run without arguments to see help`);
+main();
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..69424d9
--- /dev/null
+++ b/package.json
@@ -0,0 +1,7 @@
+{
+  "name":"zdic-cli",
+  "pkg": {
+    "assets": "zdic_json/*"
+  },
+  "bin":"index.js"
+}
\ No newline at end of file
diff --git a/screenshots/screen000.png b/screenshots/screen000.png
new file mode 100644
index 0000000..65a6663
Binary files /dev/null and b/screenshots/screen000.png differ
diff --git a/screenshots/screen001.png b/screenshots/screen001.png
new file mode 100644
index 0000000..5761b77
Binary files /dev/null and b/screenshots/screen001.png differ
diff --git a/screenshots/screen002.png b/screenshots/screen002.png
new file mode 100644
index 0000000..0a88d5b
Binary files /dev/null and b/screenshots/screen002.png differ
diff --git a/screenshots/screen003.png b/screenshots/screen003.png
new file mode 100644
index 0000000..c7c3858
Binary files /dev/null and b/screenshots/screen003.png differ
diff --git a/screenshots/screen004.png b/screenshots/screen004.png
new file mode 100644
index 0000000..f4a19d8
Binary files /dev/null and b/screenshots/screen004.png differ
diff --git a/setup.sh b/setup.sh
new file mode 100644
index 0000000..364eaeb
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,25 @@
+echo "downloading the dictionary..."
+curl https://blog.xjpvictor.info/wp-content/uploads/汉典.prc > zdic.prc
+
+cd tools
+
+echo "downloading dependencies..."
+git clone https://github.com/kroo/mobi-python
+cp -r mobi-python/mobi ./mobi
+rm -rf mobi-python
+
+echo "patching bugs in dependencies..."
+chmod +x patch.py
+./patch.py
+
+echo "converting dictionary to txt... (this might take a while ~10 mins)"
+chmod +x to_txt.py
+./to_txt.py > ../zdic.txt
+
+echo "converting txt to json..."
+mkdir ../zdic_json
+chmod +x to_json.py
+./to_json.py
+
+echo "done setting up!"
+echo "to use the app, either run 'pkg .' to package into a binary, or run 'node index.js' directly."
\ No newline at end of file
diff --git a/tools/patch.py b/tools/patch.py
new file mode 100755
index 0000000..4876b32
--- /dev/null
+++ b/tools/patch.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python2.7
+# Monkey-patch kroo/mobi-python in place: fix its record-offset slicing and
+# silence its lz77 warnings (see setup.sh, which runs this after cloning).
+t = open("mobi/__init__.py",'r').read().replace(
+    "uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])",
+    "result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+2]['record Data Offset']-self.config['mobi']['extra bytes']])"
+).replace(
+    "for record in range(1, self.config['mobi']['First Non-book index'] - 1):",
+    "for record in range(0, 10000000,1):"
+)
+open("mobi/__init__.py",'w').write(t)
+t = open("mobi/lz77.py",'r').read().replace(
+    "print(\"WARNING:","#print(\"WARNING:"
+).replace(
+    "\" beginning of text!",
+    "#"
+)
+open("mobi/lz77.py",'w').write(t)
\ No newline at end of file
diff --git a/tools/to_json.py b/tools/to_json.py
new file mode 100755
index 0000000..595ff37
--- /dev/null
+++ b/tools/to_json.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+# Split the raw zdic.txt dump into per-prefix JSON shards under ../zdic_json/.
+import os
+import re
+import io
+import sys
+import json
+
+reload(sys)
+sys.setdefaultencoding('utf8')
+
+txt = open("../zdic.txt",'r').read()
+# NOTE(review): the markup tokens inside this pattern (and the 繁体字/definition
+# patterns below) appear to have been stripped when this patch was captured --
+# recover the original regexes from the upstream repository before running.
+ent = re.findall(r' ([^<>]*?) (.*?)', txt)
+ent = [(x[0].strip(),x[1]) for x in ent]
+ent = [x for x in ent if x[0] != ""]
+
+def first(l):
+    # First element of a findall result, or "" when there was no match.
+    return "" if len(l) == 0 else l[0]
+
+def rem_bad_char(t):
+    # Drop bytes that are not valid utf-8.
+    return t.decode('utf-8','ignore')
+
+def heads_to_name(t):
+    # Build a shard filename from head characters (BMP code points only).
+    u = [x for x in t if len(hex(ord(x)))-2 <= 4]
+    u = ("".join(u)).decode("utf-8","ignore")
+    #print u
+    return u
+
+def append_if_ok(heads,word):
+    # Probe the filesystem: only accept a head character if a file by that
+    # name can actually be created (guards against unusable code points).
+    try:
+        fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
+        open(fn,'wb').write("test.")
+        os.remove(fn)
+        heads.append(unicode(word)[0].lower())
+        return True
+    except:
+        print("BAD WORD NAME:",word)
+        return False
+
+heads = []
+result = {}
+weirdo = {}
+for e in ent:
+
+    word = e[0]
+    content = e[1]
+    print word,
+
+    is_ok = True
+
+    if len(heads) == 0:
+        heads = []
+        is_ok = append_if_ok(heads,word)
+
+    else:
+        try:
+            b = not (word.lower()).startswith(heads[-1].lower())
+        except:
+            continue
+        if b:
+            # Flush the current shard once it grows past 1000 entries or
+            # accumulates 128 head characters, then start a fresh one.
+            if len(result) > 1000 or len(heads) >= 128:
+                fn = heads_to_name(heads)
+                open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))
+
+                result = {}
+                heads = []
+                is_ok = append_if_ok(heads,word)
+            else:
+                is_ok = append_if_ok(heads,word)
+
+
+    traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)",content))
+    pinyin = first(re.findall(r"拼音.*?:(.*?)[<\t ]",content))
+    zhuyin = first(re.findall(r"注音.*?:(.*?)<",content))
+    definition = re.findall(r"(.*?)",content)
+    if len(definition) == 0:
+        definition = [content]
+
+    definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
+    definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
+    definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
+    #definition = [re.sub(r"【解释】:","",d).strip() for d in definition]
+    traditional = re.sub(r"<.*?>","",traditional).replace(" ","")
+
+    # Filter out lines that merely repeat pronunciation/encoding metadata.
+    definition = [rem_bad_char(d) for d in definition
+        if (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
+        and (len(zhuyin) == 0 or zhuyin not in d ) \
+        and "繁体" not in d
+        and "简体" not in d
+        and "郑码" not in d
+        and "拼音" not in d
+        and "粤语:" not in d
+        and "潮州话:" not in d
+        and "UniCode" not in d
+        and "◎" not in d
+        and len(d.replace(word,"")) > 0
+        ]
+
+    if len(definition) == 0:
+        definition = [rem_bad_char(content)]
+        definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
+
+
+    thing = {"TRD":traditional.decode('utf-8',"ignore"),
+        "PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
+        "DEF":definition,
+        }
+    if is_ok:
+        result[word]=thing
+    else:
+        weirdo[word]=thing
+
+open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))
+
diff --git a/tools/to_txt.py b/tools/to_txt.py
new file mode 100755
index 0000000..946adb0
--- /dev/null
+++ b/tools/to_txt.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python2.7
+from mobi import Mobi
+
+book = Mobi("../zdic.prc");
+book.parse();
+
+for record in book:
+    print record,