#!/usr/bin/env python # (compatible with both Python 2.7 and Python 3) "Annotator Generator v3.384 (c) 2012-24 Silas S. Brown" # See http://ssb22.user.srcf.net/adjuster/annogen.html # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # If you want to compare this code to old versions, the old # versions are being kept in the E-GuideDog SVN repository on # http://svn.code.sf.net/p/e-guidedog/code/ssb22/adjuster # and on GitHub at https://github.com/ssb22/adjuster # and on GitLab at https://gitlab.com/ssb22/adjuster # and on BitBucket https://bitbucket.org/ssb22/adjuster # and at https://gitlab.developers.cam.ac.uk/ssb22/adjuster # and in China: https://gitee.com/ssb22/adjuster # although some early ones are missing. import sys,os,os.path,tempfile,time,re,subprocess,unicodedata import json,codecs from optparse import OptionParser if '--html-options' in sys.argv: print ("Usage: annogen.py [options]

Options:

"+", ".join(args)+": "+re.sub('(?<=[A-Za-z])([/=_])(?=[A-Za-z])',r'\1',re.sub('(--[A-Za-z-]*)',r'\1',kwargs.get("help","").replace("%default",str(kwargs.get("default","%default"))).replace('&','&').replace('<','<').replace('>','>'))).replace("BEFORE","before").replace("AFTER","after").replace("ALWAYS","always").replace(" ALL "," all ").replace(" LONG "," long ").replace(" NOT "," not ").replace("WITHOUT","without").replace("js:search:replace,","js:search:replace,")+"

") sys.exit() term = os.environ.get("TERM","") is_xterm = "xterm" in term ansi_escapes = is_xterm or term in ["screen","linux"] def isatty(f): return hasattr(f,"isatty") and f.isatty() if ansi_escapes and isatty(sys.stderr): clear_eol,reverse_on,reverse_off,bold_on,bold_off="\x1b[K","\x1b[7m","\x1b[0m","\x1b[1m","\x1b[0m" else: clear_eol,reverse_on,reverse_off,bold_on,bold_off=" "," **","** ","","" sys.stderr.write(bold_on+__doc__+bold_off+"\n") # not sys.stdout: may or may not be showing --help (and anyway might want to process the help text for website etc) options, args = parser.parse_args() globals().update(options.__dict__) try: import thread except: import _thread as thread # Python 3 import gc ; gc.disable() # should be OK if we don't create cycles (TODO: run gc.collect() manually after init, just in case?) def warn(msg): sys.stderr.write("Warning: "+msg+"\n") if "PyPy" in sys.version: warn("with annogen, PyPy is likely to run 60% slower than python") # (not to mention concurrent.futures being less likely to be available) if ybytes: ybytes=int(ybytes) if ybytes_max: ybytes_max=int(ybytes_max) else: ybytes_max = ybytes if yarowsky_debug: yarowsky_debug=int(yarowsky_debug) else: yarowsky_debug = 0 if normalise_debug: normalise_debug=int(normalise_debug) else: normalise_debug = 0 ybytes_step = int(ybytes_step) ymax_threshold = int(ymax_threshold) def errExit(msg): try: if not outfile==getBuf(sys.stdout): outfile.close() ; rm_f(c_filename) except: pass # works only if got past outfile opening sys.stderr.write(msg+"\n") ; sys.exit(1) if args: errExit("Unknown argument "+repr(args[0])) if sharp_multi and not annotation_names and (browser_extension or existing_ruby_lang_regex): errExit("--sharp-multi requires --annotation-names to be set if --browser-extension or --existing-ruby-lang-regex") if existing_ruby_lang_regex: while len(existing_ruby_lang_regex.split(','))/dev/null >/dev/null") for f in f[1:].split(','): if ':' in f: f,fSR = f.split(':',1) else: fSR=None dat = open(f,"rb").read() if fSR: fSR = fSR.split(':') for i in range(0,len(fSR),2): if not B(fSR[i]) in dat: errExit("extra-js with search and replace: unable to find "+repr(fSR[i])+" in "+f) dat = dat.replace(B(fSR[i]),B(fSR[i+1])) if can_check_syntax: out = err = True if os.path.exists("/dev/shm"): # node -c /dev/stdin can fail on some installations of GNU/Linux (but /dev/shm can fail on others, so try both) fn="/dev/shm/"+str(os.getpid())+".js" open(fn,"wb").write(dat) out,err = subprocess.Popen("node -c "+fn,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE).communicate() os.remove(fn) if out or err: out0,err0 = out,err out,err = subprocess.Popen("node -c /dev/stdin",shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE).communicate(dat) if (out or err) and not out0==True: out,err = out0+out,err0+err if out or err: errExit("Syntax check failed for extra-js file "+f+"\n"+"node stdout: "+repr(out)+"\nnode stderr: "+repr(err)) else: warn("No syntax checker available for "+f) m=re.search(br"$[^)]*$\s*=>\s*{",dat) if m: errExit(f+" seems to have arrow function (breaks compatibility with Android 4.x): "+repr(m.group())) # TODO: also check for ||= (but not in comments; comments would need rm 1st); ||= requires Chrome 85 extra_js += dat ; del dat,fSR if extra_js.rstrip() and not B(extra_js.rstrip()[-1:]) in b';}': errExit("--extra-js must end with a semicolon or a closing brace") if existing_ruby_js_fixes.startswith("@"): existing_ruby_js_fixes = open(existing_ruby_js_fixes[1:],"rb").read() if browser_extension and re.search("erHTML *=[^=]",existing_ruby_js_fixes): warn("Code in --existing-ruby-js-fixes that sets innerHTML or outerHTML might result in an extension that's not accepted by Firefox uploads") jPackage = None if java: if not '//' in java: errExit("--java must include a // to separate the first part of the path from the package name") jSrc,jRest=java.rsplit('//',1) if '.' in jRest: errExit("--java must be ...src//org/example/package not ...src//org.example.package") # (TODO: fix it automatically in both jRest and java? only on the right-hand side of the //) jPackage = jRest.replace('/','.') if 'NewFunc' in jPackage: errExit("Currently unable to include the string 'NewFunc' in your package due to an implementation detail in annogen's search/replace operations") if not c_filename and isatty(sys.stdout): c_filename = tempfile.gettempdir()+os.sep+"annotator.c" def shell_escape(arg): if re.match("^[A-Za-z0-9_=/.%+,:@-]*$",arg): return arg return "'"+arg.replace("'",r"'\''")+"'" if sharp_multi: if python: errExit("sharp-multi not yet implemented in Python") # TODO: easy enough elif windows_clipboard: errExit("sharp-multi not yet implemented for windows-clipboard") # would need a way to select the annotator, probably necessitating a GUI on Windows if java or javascript or python or dart: def cOnly(param): errExit(param+" not yet implemented in any language other than C, so cannot be used with --java, --javascript, --python or --dart") if windows_clipboard: cOnly("--windows-clipboard") if library: cOnly("--library") if not outcode=="utf-8": cOnly("Non utf-8 outcode") if compress: cOnly("--compress") if sum(1 for x in [java,javascript,python,dart] if x) > 1: errExit("Outputting more than one programming language on the same run is not yet implemented") if java: if android and not "/src//" in java: errExit("When using --android, the last thing before the // in --java must be 'src' e.g. --java=/workspace/MyProject/src//org/example/package") if not compile_only: # (delete previous files, only if we're not a subprocess) os.system("mkdir -p "+shell_escape(java)) for f in os.listdir(java): if f.endswith(".java") and f.startswith("z"): os.remove(java+os.sep+f) c_filename = java+os.sep+"Annotator.java" if android: os.system("rm -rf "+shell_escape(jSrc+"/../bin")) # needed to get rid of old *.class files that might be no longer used for d in ["assets","bin","gen","res/layout","res/menu","res/values","res/xml"]: os.system("mkdir -p "+shell_escape(jSrc+"/../"+d)) elif c_filename.endswith(".c"): if javascript: c_filename = c_filename[:-2]+".js" elif dart: c_filename = c_filename[:-2]+".dart" else: c_filename = c_filename[:-2]+".py" elif windows_clipboard: if library: errExit("Support for having both --windows-clipboard and --library at the same time is not yet implemented") # ditto if c_compiler=="cc -o annotator": c_compiler="i386-mingw32-gcc -o annoclip.exe" if not outcode=="utf-8": errExit("outcode must be utf-8 when using --windows-clipboard") elif library: if c_compiler=="cc -o annotator": c_compiler="gcc -shared -fPIC -Wl,-soname,annotator.so.1 -o libannotator.so.1 -lc" if js_6bit: if not javascript: errExit("--js-6bit requires --javascript") # or just set js_6bit=False in these circumstances? import urllib if dart: js_utf8 = not dart_datafile if dart_datafile and any(x in dart_datafile for x in "'\\$"): errExit("Current implementation cannot cope with ' or \\ or $ in dart_datafile") elif dart_datafile: errExit("--dart-datafile requires --dart") if zlib: if javascript: errExit("--zlib not supported with Javascript") del zlib try: from zopfli import zlib # pip install zopfli zlib._orig_compress = zlib.compress zlib.compress = lambda s,level: zlib._orig_compress(s) # delete level zlib_name = "zopfli" except: import zlib zlib_name = "zlib" if windows_clipboard: warn("--zlib with --windows-clipboard is inadvisable because ZLib is not typically present on Windows platforms. If you really want it, you'll need to figure out the compiler options and library setup for it.") if dart and not dart_datafile: warn("--zlib without --dart-datafile might not be as efficient as you'd hope (and --zlib prevents the resulting Dart code from being compiled to a \"Web app\" anyway)") # as it requires dart:io if rulesFile: if not (read_rules or write_rules): errExit("rulesFile requires --read-rules or --write-rules") elif read_rules and write_rules: errExit("--read-rules and --write-rules are mutually exclusive") if priority_list: errExit("can't set both rulesFile and priority-list") # because PairPriorities uses corpus, not rules elif read_rules or write_rules: errExit("--read-rules or --write-rules requires rulesFile") if java or javascript or python or dart: c_compiler = None try: xrange # Python 2 except: xrange,unichr,unicode = range,chr,str # Python 3 if post_normalise: if not (javascript or java or freq_count): errExit('--post-normalise currently requires --javascript or --java (or --freq-count)') if type("")==type(u""): # Python 3 (this requires 3.5+, TODO: support 3.3/3.4 ?) import importlib.util as iu s = iu.spec_from_file_location("post.normalise", post_normalise) post_normalise = iu.module_from_spec(s) ; s.loader.exec_module(post_normalise) else: # Python 2 import imp post_normalise = imp.load_source('post.normalise', post_normalise) post_normalise = post_normalise.table for k,v in list(post_normalise.items()): if not (k<=0xFFFF and v<=0xFFFF and len(unichr(k).encode('utf-8'))==len(unichr(v).encode('utf-8'))): del post_normalise[k] # BMP only for now, and only mappings that don't change UTF-8 length so inBytes / origInBytes are sync'd elif k==v: del post_normalise[k] # don't need identity mappings problems = set(post_normalise.keys()).intersection(set(post_normalise.values())) if problems: errExit("--post-normalise table problem: both keys AND values have "+", ".join(hex(h) for h in sorted(list(problems)))) if type(u"")==type(""): post_normalise_translate = lambda x:x.translate(post_normalise) # Python 3 can use the dictionary as-is else: post_normalise_translate = lambda u: u''.join(unichr(post_normalise.get(ord(i),ord(i))) for i in u) # as Python 2 .translate can take only len=256 (at least as documented; some versions can do more but not all tested), so we'd better write it out ourselves try: import locale terminal_charset = locale.getpreferredencoding() except: terminal_charset = None if not terminal_charset: terminal_charset = "utf-8" if existing_ruby_shortcut_yarowsky: if not (android and ybytes and glossfile): errExit("--existing-ruby-shortcut-yarowsky makes sense only when generating an Android app with both ybytes and glossfile set") def T(s): if type(s)==type(u""): return s # Python 3 return s.decode(terminal_charset) if keep_whitespace: keep_whitespace = set(T(keep_whitespace).split(',')) if ymax_limitwords: ymax_limitwords = set(T(ymax_limitwords).split(',')) if multiword_end_avoid: multiword_end_avoid = set(T(multiword_end_avoid).split(',')) if status_prefix: status_prefix += ": " else: status_prefix = "" if diagnose: diagnose=T(diagnose) diagnose_limit = int(diagnose_limit) max_words = int(max_words) if single_words: max_words = 1 if read_rules and diagnose_manual: errExit("--diagnose-manual is not compatible with --read-rules") suffix_minlen=int(suffix_minlen) if compress: squashStrings = set() ; squashReplacements = [] def squashFinish(): global squashStrings # so can set it to "done" at end tokens = set() for s in squashStrings: tokens.update(list(S(s))) totSaved = 0 tokens = [chr(t) for t in range(1,256) if not chr(t) in tokens] ; orig_tokens = set(tokens) pairs = [chr(0)] * 512 while tokens and squashStrings: t = tokens.pop() counts = {} for s in squashStrings: # To make decompression as fast and compact as possible, each 1-byte token represents 2 bytes exactly. In practice allowing it to represent variable lengths of whole bytes up to 4 is not likely to improve the compression by more than 3.2% (that's 3.2% of the 10-20% it achieves, so it's around 0.5%), and not very much better for length 9, so we might as well stick with this simpler scheme unless we do real LZMA or whatever. for i in range(0,len(s)-1): k = s[i:i+2] if S(k[:1]) in orig_tokens or S(k[1:]) in orig_tokens: continue # to keep the decoder simple, don't set things up so it needs to recurse (being able to recurse within the 2-byte expansion is very unlikely to save anything in practice anyway - it didn't on my annotators - so not worth implementing the decoder for) counts[k] = counts.get(k,0) + 1 bSaved, k = max((v,k) for k,v in counts.items()) pairs[ord(t)] = k[:1] pairs[ord(t)+256] = k[1:] squashReplacements.append((k,B(t))) # this assumes we won't be doing things like 'if ALL instances of a byte end up in our tokens, add the byte's original value as an extra token' for s in squashStrings: s2 = s.replace(k,B(t)) if not s2==s: squashStrings.remove(s) ; squashStrings.add(s2) totSaved += bSaved sys.stderr.write("Compress: %d/%d tokens, %d bytes saved%s\r" % (len(orig_tokens)-len(tokens),len(orig_tokens),totSaved,clear_eol)) ; sys.stderr.flush() squashStrings = "done" while len(pairs) > 256 and pairs[-1]==chr(0): pairs = pairs[:-1] sys.stderr.write("\n") if totSaved < len(pairs)+50: sys.stderr.write("Warning: --compress on this data made it bigger! Consider dropping --compress\n") # 50 as rough guess for OutWriteDecompress binary (probably about 12 instructions at 4+ bytes each) return c_escapeRawBytes(b"".join(B(p) for p in pairs)) decompress_func=br""" static unsigned char pairs[]="%%PAIRS%%"; static void OutWriteDecompress(const char *s) { while(*s) { int i=(unsigned char)*s; if (pairs[i]) { OutWriteByte(pairs[i]); OutWriteByte(pairs[i|0x100]); } else OutWriteByte(*s); s++; } }""" if sharp_multi: decompress_func += br""" static int ns; static void OutWriteNSB(int b) { if(b=='#') ns++; else if(ns==numSharps) OutWriteByte(b); } static void OutWriteDecompressP(const char *s) { ns=0; while(*s && ns<=numSharps) { int i=(unsigned char)*s; if (pairs[i]) { OutWriteNSB(pairs[i]); OutWriteNSB(pairs[i|0x100]); } else OutWriteNSB(*s); s++; } }""" def squash(byteStr): if squashStrings == "done": for k,v in squashReplacements: byteStr = byteStr.replace(k,v) else: squashStrings.add(byteStr) # for the dry run return byteStr elif sharp_multi: decompress_func = br""" static void OutWriteStrP(const char *annot) { int ns = numSharps; while(ns--) { annot = strchr(annot,'#'); if (!annot) return; else annot++; } char* m = strchr(annot,'#'); if(m) OutWriteStrN(annot,m-annot); else OutWriteStr(annot); } """ else: decompress_func = b"" def annotMap(varName="annotNo",mayNeedParen=False): r = "" if annotation_map: for i in annotation_map.split(","): k,v = i.split('=') r += varName+"=="+str(int(k)-1)+"?"+str(int(v)-1)+":" r += varName if mayNeedParen and "==" in r: r="("+r+")" return B(r) if c_filename and os.sep in c_filename: cfn = c_filename[c_filename.rindex(os.sep)+1:] else: cfn = c_filename if library: c_preamble = br""" /* This library is NOT thread safe. But you can use it with single-threaded or multiprocess code like Web Adjuster (not in WSGI mode). To wrap this library in Python (2 or 3), you can do: from ctypes import CDLL,c_char_p,c_int alib = CDLL("./libannotator.so.1") _annotate,_afree = alib.annotate,alib.afree _annotate.restype = c_char_p _annotate.argtypes = [c_char_p""" if sharp_multi: c_preamble += b",c_int" c_preamble += b",c_int]" if outcode=="utf-8": c_preamble += br""" _annotateRL = alib.annotateRawLatinize _annotateRL.restype = c_char_p _annotateRL.argtypes = [c_char_p""" if sharp_multi: c_preamble += b",c_int" c_preamble += b"]\ndef annotR(txt" if sharp_multi: c_preamble += b",aType=0" c_preamble += br"""): if type(txt)==type(u''): txt = txt.encode('utf-8') r = _annotateRL(txt""" if sharp_multi: c_preamble += b",aType" c_preamble += br""") _afree() ; return r""" c_preamble += b"\ndef annotate(txt" if sharp_multi: c_preamble += b",aType=0" c_preamble += br""",aMode=1): "aMode: 0 = raw, 1 = ruby (default), 2 = braces" if type(txt)==type(u''): txt = txt.encode('"""+B(outcode)+br"""') r = _annotate(txt""" if sharp_multi: c_preamble += b",aType" c_preamble += br""",aMode) _afree() ; return r # then for Web Adjuster you can do, for example, # adjuster.annotFunc1 = lambda t:annotate(t""" if sharp_multi: c_preamble += b",1" c_preamble += b",1)\n" if outcode=="utf-8": if sharp_multi: c_preamble += b"# adjuster.annotFunc1R = lambda t:annotR(t,1)" else: c_preamble += b"# adjuster.annotFunc1R = annotR" c_preamble += br""" # adjuster.options.htmlFilter = "*annotFunc1#*annotFunc1R" # adjuster.options.htmlFilterName = "ruby#annot-only" """ else: c_preamble += br""" # adjuster.options.htmlFilter = "*annotFunc1" """ if not outcode=="utf-8": c_preamble += br""" # but BEWARE Web Adjuster assumes UTF-8; you'd better write a wrapper to re-code it """ # (TODO: automate this?) c_preamble += br""" Compile with: gcc -shared -fPIC -Wl,-soname,annotator.so.1 -o libannotator.so.1 annotator.c -lc */ """ if cfn: c_preamble=c_preamble.replace(b"annotator.c",B(cfn)) c_preamble += br""" #include #include """ c_defs = br"""static const unsigned char *readPtr, *writePtr, *startPtr; static char *outBytes; static size_t outWriteLen,outWritePtr; #define NEXTBYTE (*readPtr++) #define NEXT_COPY_BYTE (*writePtr++) #define COPY_BYTE_SKIP writePtr++ #define COPY_BYTE_SKIPN(n) writePtr += (n) #define POSTYPE const unsigned char* #define THEPOS readPtr #define SETPOS(p) (readPtr=(p)) #define PREVBYTE readPtr-- #define FINISHED (!(*readPtr)) static void OutWriteStrN(const char *s,size_t l) { size_t newLen = outWriteLen; while (outWritePtr+l > newLen) newLen *= 2; if (newLen > outWriteLen) { char *ob2 = realloc(outBytes,newLen); if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */ outBytes = ob2; outWriteLen = newLen; } memcpy(outBytes+outWritePtr, s, l); outWritePtr += l; } static void OutWriteStr(const char *s) { OutWriteStrN(s,strlen(s)); } static void OutWriteByte(char c) { if (outWritePtr >= outWriteLen) { size_t newLen = outWriteLen * 2; char *ob2 = realloc(outBytes,newLen); if (!ob2) return; /* This check is meaningless if the kernel overcommits, but I don't know if that's true on (all versions of) Android. */ outBytes = ob2; outWriteLen = newLen; } outBytes[outWritePtr++] = c; } int near(char* string) { const unsigned char *startFrom = readPtr-nearbytes, *end = readPtr+nearbytes; if (startFrom < startPtr) startFrom = startPtr; size_t l=strlen(string); end -= l; while (*startFrom && startFrom <= end) { if(!strncmp(startFrom,string,l)) return 1; startFrom++; } return 0; } void matchAll();""" c_defs += br""" void afree() { if(outBytes) free(outBytes); outBytes=NULL; } char *annotate(const char *input""" if sharp_multi: c_defs += b", int annotNo" c_defs += br""",int aMode) { readPtr=writePtr=startPtr=(char*)input; outWriteLen = strlen(startPtr)*5+1; /* initial guess (must include the +1 to ensure it's non-0 for OutWrite...'s *= code) */ afree(); outBytes = malloc(outWriteLen);""" if sharp_multi: c_defs += b" numSharps="+annotMap()+b";" c_defs += br""" annotation_mode = aMode; if(outBytes) { outWritePtr = 0; matchAll(); } if(outBytes) OutWriteByte(0); return outBytes; } """ if outcode=="utf-8": # (TODO: document this feature? non-utf8 versions ??) c_defs += br""" static void latinizeMatch(); static int latCap,latSpace; char *annotateRawLatinize(const char *input""" if sharp_multi: c_defs += b", int annotNo" c_defs += br""") { // "Bonus" library function, works only if annotation is Latin-like, // tries to improve the capitalisation when in 'raw' mode // (TODO: make this available in other annogen output formats? work into ruby mode??) char *tmp=annotate(input""" if sharp_multi: c_defs += b",annotNo" c_defs += br""",annotations_only); if(tmp) { tmp=strdup(tmp); if(tmp) { readPtr=writePtr=startPtr=tmp; afree(); outBytes=malloc(outWriteLen); if(outBytes) { outWritePtr = 0; latCap=1; latSpace=0; while(!FINISHED) { POSTYPE oldPos=THEPOS; latinizeMatch(); if (oldPos==THEPOS) { OutWriteByte(NEXTBYTE); COPY_BYTE_SKIP; } } } if(outBytes) OutWriteByte(0); free(tmp); } } return(outBytes); } static inline void doLatSpace() { if(latSpace) { OutWriteByte(' '); latSpace = 0; } } static void latinizeMatch() { POSTYPE oldPos=THEPOS; int nb = NEXTBYTE; if (latCap || latSpace) { if (nb >= '0' && nb <= '9') latSpace = 0; /* 1:1 */ else if(nb >= 'A' && nb <= 'Z') { latCap = 0; doLatSpace(); } else if(nb >= 'a' && nb <= 'z') { doLatSpace(); if(latCap) { latCap = 0; OutWriteByte(nb-('a'-'A')); return; } } else switch(nb) { case 0xC3: { int nb2 = NEXTBYTE; switch(nb2) { case 0x80: case 0x81: case 0x88: case 0x89: case 0x8c: case 0x8d: case 0x92: case 0x93: case 0x99: case 0x9a: doLatSpace(); latCap=0; break; case 0xa0: case 0xa1: case 0xa8: case 0xa9: case 0xac: case 0xad: case 0xb2: case 0xb3: case 0xb9: case 0xba: doLatSpace(); if (latCap) { OutWriteByte(0xC3); OutWriteByte(nb2-0x20); latCap=0; return; } } break; } case 0xC4: { int nb2 = NEXTBYTE; switch(nb2) { case 0x80: case 0x92: case 0x9a: case 0xaa: doLatSpace(); latCap=0; break; case 0x81: case 0x93: case 0x9b: case 0xab: doLatSpace(); if (latCap) { OutWriteByte(0xC4); OutWriteByte(nb2-1); latCap=0; return; } } break; } case 0xC5: { int nb2 = NEXTBYTE; switch(nb2) { case 0x8c: case 0xaa: doLatSpace(); latCap=0; break; case 0x8d: case 0xab: doLatSpace(); if (latCap) { OutWriteByte(0xC5); OutWriteByte(nb2-1); latCap=0; return; } } break; } case 0xC7: { int nb2 = NEXTBYTE; switch(nb2) { case 0x8d: case 0x8f: case 0x91: case 0x93: case 0x95: case 0x97: case 0x99: case 0x9b: doLatSpace(); latCap=0; break; case 0x8e: case 0x90: case 0x92: case 0x94: case 0x96: case 0x98: case 0x9a: case 0x9c: doLatSpace(); if (latCap) { OutWriteByte(0xC7); OutWriteByte(nb2-1); latCap=0; return; } } break; } } } switch(nb) { case 0xE2: /* could be opening quote */ if(NEXTBYTE==0x80) switch(NEXTBYTE) { case 0x98: case 0x9c: OutWriteByte(' '); latSpace = 0; } break; case 0xE3: /* could be Chinese stop or list-comma */ if(NEXTBYTE==0x80) switch(NEXTBYTE) { case 0x81: OutWriteByte(','); latSpace = 1; return; case 0x82: OutWriteByte('.'); latSpace = 1; latCap=1; return; } break; case 0xEF: /* could be full-width ascii */ switch(NEXTBYTE) { case 0xBC: { int b=NEXTBYTE; if (b >= 0x81 && b <= 0xbf) { int punc = b-(0x81-'!'); switch(punc) { case '(': OutWriteByte(' '); latSpace = 0; } OutWriteByte(punc); if (punc >= 0x90 && punc <= 0x99) latSpace = 0; else switch(punc) { case '!': case '.': case '?': latCap = 1; /* fall through */ case ')': case ',': case ':': case ';': latSpace = 1; } return; } break; } case 0xBD: { int b=NEXTBYTE; if (b >= 0x80 && b <= 0x9d) { /* TODO: capitalise if it's a letter (but probably not needed in most annotations) */ OutWriteByte(b-(0x80-'`')); return; } } break; } break; } SETPOS(oldPos); } """ have_annotModes = library # only ruby is needed by the Android code elif windows_clipboard: c_preamble = br"""/* For running on Windows desktop or WINE, compile with: i386-mingw32-gcc annoclip.c -o annoclip.exe For running on Windows Mobile 2003SE, 5, 6, 6.1 or 6.5, compile with: arm-cegcc-gcc annoclip.c -D_WINCE -Os -o annoclip-WM.exe or (if you have MSVC 2008 on a Windows machine), set PATH=%VCINSTALLDIR%\ce\bin\x86_arm;%PATH% set lib=%VCINSTALLDIR%\ce\lib\armv4 set include=%VSINSTALLDIR%\SmartDevices\SDK\Smartphone2003\Include;%VCINSTALLDIR%\ce\include;%VCINSTALLDIR%\include set CL=/TP /EHsc /D "_WIN32_WCE=0x420" /D UNDER_CE /D WIN32_PLATFORM_PSPC /D _WINCE /D _WINDOWS /D ARM /D _ARM_ /D _UNICODE /D UNICODE /D POCKETPC2003_UI_MODEL set LINK=/force:multiple /NODEFAULTLIB:oldnames.lib /SUBSYSTEM:WINDOWSCE /LIBPATH:"C:\Program Files\Windows Mobile 5.0 SDK R2\PocketPC\Lib\ARMV4I" /OUT:annoclip-WM.exe /MANIFEST:NO /STACK:65536,4096 /DYNAMICBASE:NO aygshell.lib coredll.lib corelibc.lib ole32.lib oleaut32.lib uuid.lib commctrl.lib cl /D_WIN32_IE=0x0400 /D_WIN32_WCE=0x0400 /Os /Og annoclip.c (you could try omitting /Os /Og for faster compilation, but RAM is likely important on the Windows Mobile device) */ #include #include #define UNICODE 1 /* for TCHAR to be defined correctly */ #include #ifdef near #undef near #endif FILE* outFile = NULL; unsigned char *p, *copyP, *pOrig; #define OutWriteStr(s) fputs((s),outFile) #define OutWriteStrN(s,n) fwrite((s),(n),1,outFile) #define OutWriteByte(c) fputc((c),outFile) #define NEXTBYTE (*p++) #define NEXT_COPY_BYTE (*copyP++) #define COPY_BYTE_SKIP copyP++ #define COPY_BYTE_SKIPN(n) copyP += (n) #define POSTYPE unsigned char* #define THEPOS p #define SETPOS(sp) (p=(sp)) #define PREVBYTE p-- #define FINISHED (!*p && !p[1]) """ if cfn: c_preamble=c_preamble.replace(b"annoclip.c",B(cfn)) c_defs = br"""static int near(char* string) { POSTYPE o=p; if(p>pOrig+nearbytes) o-=nearbytes; else o=pOrig; size_t l=strlen(string); POSTYPE max=p+nearbytes-l; while (*o && o <= max) { if(!strncmp((char*)o,(char*)string,l)) return 1; o++; } return 0; } """ have_annotModes = False # only ruby is needed by the windows_clipboard code else: c_preamble = br""" #include #include /* To include this code in another program, define the ifndef'd macros below + define Omit_main */ """ c_defs = br"""#ifndef NEXTBYTE /* Default definition of NEXTBYTE etc is to read input from stdin and write output to stdout. */ enum { Half_Bufsize = %%LONGEST_RULE_LEN%% }; static unsigned char lookahead[Half_Bufsize*2]; static size_t readPtr=0,writePtr=0,bufStart=0,bufLen=0; static int nextByte() { if (readPtr-bufStart +ybytes >= bufLen && !feof(stdin)) { if (bufLen == Half_Bufsize * 2) { memmove(lookahead,lookahead+Half_Bufsize,Half_Bufsize); bufStart += Half_Bufsize; bufLen -= Half_Bufsize; } bufLen += fread(lookahead+bufLen,1,Half_Bufsize*2-bufLen,stdin); } if (readPtr-bufStart == bufLen) return EOF; return lookahead[(readPtr++)-bufStart]; } static int near(char* string) { /* for Yarowsky-like matching */ size_t offset = readPtr-bufStart, l=strlen(string), maxPos = bufLen; if (maxPos >= l) maxPos -= l; else return 0; // can't possibly start after maxPos-l if (offset+nearbytes>l) { if (maxPos > offset+nearbytes-l) maxPos = offset+nearbytes-l; } else maxPos = 0; // (don't let it go below 0, as size_t is usually unsigned) if (offset>nearbytes) offset-=nearbytes; else offset = 0; while (offset <= maxPos) { if(!strncmp((char*)lookahead+offset,string,l)) return 1; offset++; } return 0; } #define NEXTBYTE nextByte() #define NEXT_COPY_BYTE lookahead[(writePtr++)-bufStart] #define COPY_BYTE_SKIP writePtr++ #define COPY_BYTE_SKIPN(n) writePtr += (n) #define POSTYPE size_t #define THEPOS readPtr /* or get it via a function */ #define SETPOS(p) (readPtr=(p)) /* or set via a func */ #define PREVBYTE readPtr-- #define FINISHED (feof(stdin) && readPtr-bufStart == bufLen) #define OutWriteStr(s) fputs((s),stdout) #define OutWriteStrN(s,n) fwrite((s),(n),1,stdout) #define OutWriteByte(c) putchar(c) #endif """ have_annotModes = True if have_annotModes: c_defs = br""" #ifndef Default_Annotation_Mode #define Default_Annotation_Mode ruby_markup #endif enum { annotations_only, ruby_markup, brace_notation, segment_only} annotation_mode = Default_Annotation_Mode; """ + c_defs c_switch1=br"""switch (annotation_mode) { case annotations_only: OutWriteDecompressP(annot); COPY_BYTE_SKIPN(numBytes); break; case ruby_markup:""" c_switch2=br"""break; case brace_notation: OutWriteByte('{'); for(;numBytes;numBytes--) OutWriteByte(NEXT_COPY_BYTE); OutWriteByte('|'); OutWriteDecompressP(annot); OutWriteByte('}'); break; case segment_only: for(;numBytes;numBytes--) OutWriteByte(NEXT_COPY_BYTE); break; }""" c_switch3 = b"if (annotation_mode == ruby_markup) {" c_switch4 = b"} else o(numBytes,annot);" else: c_switch1=c_switch2=c_switch3=c_switch4=b"" c_preamble += b'#include \n' if sharp_multi: c_preamble += b'#include \n' if zlib: c_preamble += b'#include "zlib.h"\n' if sharp_multi: c_preamble += b"static int numSharps=0;\n" version_stamp = B(time.strftime("generated %Y-%m-%d by ")+__doc__[:__doc__.index("(c)")].strip()) c_start = b"/* -*- coding: "+B(outcode)+b" -*- */\n/* C code "+version_stamp+b" */\n" c_start += c_preamble+br""" enum { ybytes = %%YBYTES%% }; /* for Yarowsky-like matching, minimum readahead */ static int nearbytes = ybytes; #define setnear(n) (nearbytes = (n)) """ + c_defs + br"""static int needSpace=0; static void s() { if (needSpace) OutWriteByte(""" if have_annotModes: c_start += b"annotation_mode==segment_only?'-':' '); /* (hyphen is probably the best separator character if our binary will be used for Gradint's espeak_preprocessors option) */" else: c_start += b"' ');" c_start += br""" else needSpace=1; /* for after the word we're about to write (if no intervening bytes cause needSpace=0) */ } static void s0() { if (needSpace) { OutWriteByte(""" if have_annotModes: c_start += b"annotation_mode==segment_only?'-':' '" else: c_start += b"' '" c_start += br"""); needSpace=0; } }""" + decompress_func + br""" static void c(int numBytes) { /* copyBytes, needSpace unchanged */ for(;numBytes;numBytes--) OutWriteByte(NEXT_COPY_BYTE); } static void o(int numBytes,const char *annot) { s();""" + c_switch1 + br""" OutWriteStr(""); for(;numBytes;numBytes--) OutWriteByte(NEXT_COPY_BYTE); OutWriteStr(""); OutWriteDecompressP(annot); OutWriteStr(""); """+c_switch2+br""" } static void o2(int numBytes,const char *annot,const char *title) {"""+c_switch3+br""" s(); OutWriteStr(""); for(;numBytes;numBytes--) OutWriteByte(NEXT_COPY_BYTE); OutWriteStr(""); OutWriteDecompressP(annot); OutWriteStr(""); """+c_switch4+b"}" if not sharp_multi: c_start = c_start.replace(b"OutWriteDecompressP",b"OutWriteDecompress") if not compress: c_start = c_start.replace(b"OutWriteDecompress",b"OutWriteStr") # and hence OutWriteDecompressP to OutWriteStrP c_end = br""" void matchAll() {""" if zlib: c_end += b" if(!data) init();\n" c_end += br""" while(!FINISHED) { POSTYPE oldPos=THEPOS; topLevelMatch(); if (oldPos==THEPOS) { needSpace=0; OutWriteByte(NEXTBYTE); COPY_BYTE_SKIP; } } }""" # (innerHTML support should be OK at least from Chrome 4 despite MDN compatibility tables not going back that far) annotation_font = [b"Times New Roman"] # Android has NotoSerif but you can't select it by name, it's mapped from "Times New Roman" (tested in Android 4.4, Android 10 and Android 12, however in Android 12 it does not work for printing, so we'll override it to "sans-serif" in Android 11+ below) # there's a more comprehensive list in the windows_clipboard code below, but those fonts are less likely found on Android # jsAddRubyCss is Android-only. Browser extensions instead use extension_rubycss. jsAddRubyCss=b"all_frames_docs(function(d) { if(d.rubyScriptAdded==1 || !d.body) return; var e=d.createElement('span'); e.innerHTML=''" if known_characters: jsAddRubyCss += b"""+''""" def sort20px(singleQuotedStr): # 20px is relative to zoom assert singleQuotedStr.startswith(b"'") and singleQuotedStr.endswith(b"'") if not android_template: return singleQuotedStr return singleQuotedStr.replace(b"20px",b"'+Math.round(20/Math.pow((ssb_local_annotator.canCustomZoom()?ssb_local_annotator.getRealZoomPercent():100)/100,0.6))+'px") # (do allow some scaling, but not by the whole zoom factor) def bookmarkJS(): "Returns inline JS expression (to be put in parens) that evaluates to HTML fragment to be added for bookmarks, and event-setup code to be added after (as Content-Security-Policy could be unsafe-inline + unsafe-eval)" assert not '"' in android, "bookmarkJS needs re-implementing if --android URL contains quotes: please %-escape it" should_show_bookmarks = B("(location.href=='"+android.replace("'",r"\'")+"'&&!document.noBookmarks)") # noBookmarks is used for handling ACTION_SEND, since it has the same href (TODO @lower-priority: use different href instead?) are_there_bookmarks = b"ssb_local_annotator.getBMs().replace(/,/g,'')" show_bookmarks_string = br"""'

'+(function(){var c='

Bookmarks you added

[Delete] '+title+''}return c+'

'})()+'

Clipboard

"); p=pOrig; copyP=p; matchAll(); free(pOrig); OutWriteStr(""); fclose(outFile); TCHAR fn2[sizeof(fname)]; int i; for(i=0; fname[i]; i++) fn2[i]=fname[i]; fn2[i]=(TCHAR)0; SHELLEXECUTEINFO sei; memset(&sei, 0, sizeof(sei)); sei.cbSize = sizeof(sei); sei.lpVerb = TEXT("open"); sei.lpFile = fn2; sei.nShow = SW_SHOWNORMAL; if (!ShellExecuteEx(&sei)) errorExit("ShellExecuteEx"); // TODO: sleep(); remove{fname); ? // (although it will probably be the same on each run) DestroyWindow(win); // TODO: needed? } """ elif not library: c_end += br""" #ifndef Omit_main int main(int argc,char*argv[]) { int i=1;""" if sharp_multi: c_end += br""" if(i

""" # The versionCode, versionName and sharedUserId attributes in the above are also picked up on in the code below if epub: android_manifest += br"""""" # API 23 (Android 6) needs extra code to request this permission # (which we don't do), but Send URIs should arrive as content:// # (they should be content:// on Android 5). # Android 13 (API 33) apps shouldn't declare READ_EXTERNAL_STORAGE. # On API 19 (Android 4.4), the external storage permission is: # (1) needed for opening epubs from a file manager, # (2) automatically propagated throughout sharedUserId (if one of your apps has it then they will all get it), # (3) persists until the next reboot if you reinstall your apps without it. # Points 2 and 3 can make developers think it's not really needed :-( if pleco_hanping or tts_js: android_manifest+=b"\n" if pleco_hanping: android_manifest += br""" """ if tts_js: android_manifest += br""" """ android_manifest+=b"\n" android_manifest += br"""

""" if epub: android_manifest += br""" """ android_manifest += b"\n

\n" android_layout = br""" """ if android_template == "blank": android_template = B(r"""

"""+app_name+r"

URL_BOX_GOES_HERE") elif android_template: android_template = open(android_template,'rb').read() if not b" """ # Now all other controls: if epub: android_url_box += br""" """ android_url_box += br""" """ # API 19 (4.4) and below has no browser updates. API 17 (4.2) and below has known shell exploits for CVE-2012-6636 which requires only that a site (or network access point) can inject arbitrary Javascript into the HTTP stream. Not sure what context the resulting shell runs in, but there are probably escalation attacks available. TODO: insist on working offline-only on old versions? android_url_box += b'' if android_template: android_template = android_template.replace(b"URL_BOX_GOES_HERE",android_url_box) if not b"VERSION_GOES_HERE" in android_template: android_template = android_template.replace(b"document.write('

%%DATE%% version

"+e.toString()+"

Could this be a permissions problem?").getBytes())); } catch (SecurityException e) { return new WebResourceResponse("text/html","utf-8",new ByteArrayInputStream(("Insufficient permissions to open "+epubUrl+"

"+e.toString()).getBytes())); } java.util.zip.ZipEntry ze; try { ByteArrayOutputStream f=null; if(part==null) { f=new ByteArrayOutputStream(); String fName=epubUrl; int slash=fName.lastIndexOf("/"),slash2=fName.lastIndexOf("%2F"); if(slash2>slash) slash=slash2+2; if(slash>-1) fName=fName.substring(slash+1); f.write(("

"+fName+"

Until I write a real table-of-contents handler, you have to make do with this:").getBytes()); } boolean foundHTML = false; // doubles as 'foundPart' if getNextPage while ((ze = zin.getNextEntry()) != null) { if (part==null) { if(ze.getName().contains("toc.xhtml")) return new WebResourceResponse("text/html","utf-8",new ByteArrayInputStream(("Loading... ").getBytes())); // TODO: we should really be getting this via content.opf which is ref'd in META-INF/container.xml "+ze.getName()+"").getBytes()); } } else if (ze.getName().equalsIgnoreCase(part)) { if(getNextPage) { foundHTML = true; } else { int bufSize=2048; if(ze.getSize()==-1) { f=new ByteArrayOutputStream(); } else { bufSize=(int)ze.getSize(); f=new ByteArrayOutputStream(bufSize); } byte[] buf=new byte[bufSize]; int r; while ((r=zin.read(buf))!=-1) f.write(buf,0,r); String mimeType=android.webkit.MimeTypeMap.getSingleton().getMimeTypeFromExtension(android.webkit.MimeTypeMap.getFileExtensionFromUrl(ze.getName())); if(mimeType==null || mimeType.equals("application/xhtml+xml")) mimeType="text/html"; // needed for annogen style modifications if(mimeType.equals("text/html")) { // TODO: if ((epubUrl.startsWith("file:") || epubUrl.contains("com.android.externalstorage")) && part!="toc.xhtml") then getSharedPreferences putString("eR"+epubUrl,part) ? To avoid unbounded buildup, need to store only the most recent few (use one pref with separators? or other mechanism e.g. 0=url 1=url ... nxtWrite=2 w. wraparound?) Then add "jump to last seen page" link from both directory and toc.xhtml (latter will need manipulation as below) return new WebResourceResponse(mimeType,"utf-8",new ByteArrayInputStream(f.toString().replaceFirst("","

Next").getBytes())); // TODO: will f.toString() work if f is utf-16 ? } else return new WebResourceResponse(mimeType,"utf-8",new ByteArrayInputStream(f.toByteArray())); } } else if(foundHTML && ze.getName().contains("htm")) return new WebResourceResponse("text/html","utf-8",new ByteArrayInputStream(("Loading... ").getBytes())); } if(part==null) { if(!foundHTML) f.write(("

Error: No HTML files were found in this EPUB").getBytes()); return new WebResourceResponse("text/html","utf-8",new ByteArrayInputStream(f.toByteArray())); } else if(foundHTML) return new WebResourceResponse("text/html","utf-8",new ByteArrayInputStream(("No more pages").getBytes())); else return new WebResourceResponse("text/html","utf-8",new ByteArrayInputStream(("No zip entry for "+part+" in "+epubUrl).getBytes())); } catch (IOException e) { return new WebResourceResponse("text/html","utf-8",new ByteArrayInputStream("IOException".getBytes())); } finally { try { zin.close(); } catch(IOException e) {} } }""" if android_print: android_print_script = br"""if(ssb_local_annotator.canPrint())document.write("""+sort20px(br"""''""")+br"""+ssb_local_annotator.canPrint().replace('0.3ex','0.3ex;display:inline-block')+'')""" else: android_print_script = b"" if epub and android_print: android_src = android_src.replace(b"Next",b"Next") if not android_template: android_src += br""" float scale = 0; boolean scaling = false; public void onScaleChanged(final WebView view,float from,final float to) { if (AndroidSDK < 19 || !view.isShown() || scaling || Math.abs(scale-to)<0.01) return; scaling=view.postDelayed(new Runnable() { public void run() { view.evaluateJavascript("document.body.style.width=((window.visualViewport!=undefined?window.visualViewport.width:window.innerWidth)-getComputedStyle(document.body).marginLeft.replace(/px/,'')*1-getComputedStyle(document.body).marginRight.replace(/px/,'')*1)+'px';window.setTimeout(function(){document.body.scrollLeft=0},400)",null); // window.outerWidth will still be excessive on 4.4; not sure there's much we can do about that scale=to; scaling=false; } }, 100); }""" android_src += br""" public void onPageFinished(WebView view,String url) { if(AndroidSDK < 19) // Pre-Android 4.4, so below runTimer() alternative won't work. This version has to wait for the page to load entirely (including all images) before annotating. Also handles displaying the forward button when needed (4.4+ uses different logic for this in onKeyDown, because API19+ reduces frequency of scans when same length, due to it being only a backup to MutationObserver) browser.loadUrl("javascript:"+js_common+"function AnnotMonitor() { AnnotIfLenChanged();if(!document.doneFwd && ssb_local_annotator.canGoForward()){var e=document.getElementById('annogenFwdBtn');if(e){e.style.display='inline';document.doneFwd=1}}window.setTimeout(AnnotMonitor,1000)} AnnotMonitor()"); else browser.evaluateJavascript(js_common+"AnnotIfLenChanged(); var m=window.MutationObserver;if(m)new m(function(mut){var j;if(mut.length==1)for(j=0;j= 3 && AndroidSDK < 14) { /* (we have our own zoom functionality on API 14+ which works better on 19+) */ browser.getSettings().setBuiltInZoomControls(true); } if (AndroidSDK < 14) { final int size=Math.round(16*fs); browser.getSettings().setDefaultFontSize(size); browser.getSettings().setDefaultFixedFontSize(size); }""" else: android_src += br""" if(AndroidSDK >= 3) browser.getSettings().setBuiltInZoomControls(true); float fs = getResources().getConfiguration().fontScale; // from device accessibility settings if (fs < 1.0f) fs = 1.0f; // bug in at least some versions of Android 8 returns 0 for fontScale final int size=Math.round(16*fs); // from device accessibility settings (might be squared if OS does it too, but that's OK because the settings don't give enough of a range) browser.getSettings().setDefaultFontSize(size); browser.getSettings().setDefaultFixedFontSize(size);""" android_src += br""" browser.getSettings().setDefaultTextEncodingName("utf-8"); runTimerLoop(); if (savedInstanceState!=null) browser.restoreState(savedInstanceState); else if (!handleIntent(getIntent())) browser.loadUrl("%%ANDROID-URL%%"); } @Override public void onNewIntent(Intent intent) { super.onNewIntent(intent); handleIntent(intent); } boolean handleIntent(Intent intent) { if(browser==null) return false; if (Intent.ACTION_SEND.equals(intent.getAction()) && "text/plain".equals(intent.getType())) { sentText = intent.getStringExtra(Intent.EXTRA_TEXT); if (sentText == null) return false; browser.loadUrl("javascript:document.close();document.noBookmarks=1;document.rubyScriptAdded=0;document.write(''+ssb_local_annotator.getSentText().replace(/&/g,'&').replace(/'+p1+''}).replace('\\n','
'));"""+android_print_script+br""""); } else if (Intent.ACTION_VIEW.equals(intent.getAction())) { String url=intent.getData().toString();""" if epub: android_src += br""" if (((url.startsWith("file:") || url.startsWith("content:")) && url.endsWith(".epub")) || "application/epub+zip".equals(intent.getType())) openEpub(url); else""" android_src += br""" loadingWait(url); } else return false; return true; } void loadingWait(String url) { browser.loadUrl("javascript:document.close();document.noBookmarks=1;document.write('Loading, please wait...')"); browser.loadUrl(url); } String sentText = null;""" if epub: android_src += br""" void openEpub(String url) { if(AndroidSDK<11 && url.endsWith(".epub")) { browser.loadUrl("javascript:document.close();document.noBookmarks=1;document.rubyScriptAdded=0;document.write('This app'+\"'s EPUB handling requires Android 3 or above :-(\")"); return; } // (Support for Android 2 would require using data URIs for images etc, and using shouldOverrideUrlLoading on all links) // Android 5+ content:// URIs expire when the receiving Activity finishes, so we won't be able to add them to bookmarks (unless copy the entire epub, which is not good on a space-limited device) SharedPreferences sp=getPreferences(0); android.content.SharedPreferences.Editor e; do { e=sp.edit(); e.putString("epub",url); } while(!e.commit()); loadingWait("http://epub/"); // links will be absolute; browser doesn't have to change } @Override protected void onActivityResult(int request, int result, Intent intent) { if(request!=8778 || intent==null || result!=-1) return; boolean isEpub=false; try{byte[] buf=new byte[58]; getContentResolver().openInputStream(Uri.parse(intent.getData().toString())).read(buf,0,58); isEpub=buf[0]=='P' && buf[1]=='K' && buf[2]==3 && buf[3]==4 && new String(buf,30,28).equals("mimetypeapplication/epub+zip"); }catch(Exception e){} if(isEpub) openEpub(intent.getData().toString()); else {Toast.makeText(this, "That wasn't an EPUB file :-(",Toast.LENGTH_LONG).show();} }""" if pleco_hanping: android_src += br""" int dictionaries = 0; boolean gotPleco = false; String[] hanpingPackage = new String[]{"com.embermitre.hanping.cantodict.app.pro","com.embermitre.hanping.app.pro","com.embermitre.hanping.app.lite"}; int[] hanpingVersion = new int[]{0,0,0};""" if tts_js: android_src += br""" String ttsList = ""; TextToSpeech tts=null,tts2=null, tts_keep=null; int found_dx=-1; int nextID=0; @TargetApi(21) boolean doTTS(String text) { if(tts_keep==null) return false; int maxLen=TextToSpeech.getMaxSpeechInputLength(); int queueMode = TextToSpeech.QUEUE_FLUSH; while(text.length() > 0) { String t2; if (text.length() > maxLen) { t2=text.substring(0,maxLen); if(t2.indexOf("\u3002")>0) t2=t2.substring(0,t2.lastIndexOf("\u3002")+1); else if(t2.indexOf(". ")>0) t2=t2.substring(0,t2.lastIndexOf(". ")+2); } else t2 = text; if(tts_keep.speak(t2,queueMode,null,String.valueOf(nextID++))!=TextToSpeech.SUCCESS) return false; text = text.substring(t2.length()); queueMode = TextToSpeech.QUEUE_ADD; } return true; } java.util.List eiList = null; @TargetApi(21) String TTSTest(int batchNo,String voices_to_set) { final android.content.Context context = this; if (batchNo==1) { if (ttsList != "") return ttsList; if(AndroidSDK < 21) return "Android 5+ required for multilingual TTS"; if(eiList==null) { tts = new TextToSpeech(context,new TextToSpeech.OnInitListener(){ public void onInit(int status) { if(tts == null) { ttsList += "race-condition fail"; return; } if (status != 0) { ttsList += "init fail"; return; } eiList = tts.getEngines(); try { tts.shutdown(); } catch(Exception e) {} tts = null; TTSTest(1,voices_to_set); } }); return "Fetching engine list"; } ttsList="TTS voice list:\n"; } int i=0; boolean found=false; for(TextToSpeech.EngineInfo ei : eiList) { if (++i < batchNo) continue; found = true; tts2 = new TextToSpeech(context,new TextToSpeech.OnInitListener(){ public void onInit(int status) { if (tts2 == null) { ttsList += "(engine race-condition fail)"; return; } if (status != 0) { ttsList += "(engine init fail)"; return; } boolean do_shutdown = true; java.util.Set voices; try { voices=tts2.getVoices(); } catch(Exception e) { voices=null; } if(voices==null /* (either by exception or otherwise, e.g. on somebody's Android 11 getVoices() simply returned null) */) { ttsList += "(getVoices fail)"; return; } for(Voice v: voices) { ttsList += v.getName()+"(lang="+v.getLocale().getLanguage()+" variant="+v.getLocale().getVariant()+" quality="+String.valueOf(v.getQuality())+" connection="+(v.isNetworkConnectionRequired()?"t":"f")+" latency="+String.valueOf(v.getLatency())+")\n"; int dx=voices_to_set.indexOf(","+v.getName()+","); if (dx>-1 && (found_dx==-1 || dx < found_dx) && tts2.setVoice(v)==TextToSpeech.SUCCESS) { if(tts_keep!=null && do_shutdown) try { tts_keep.shutdown(); } catch(Exception e) {} // != tts2 tts_keep = tts2; do_shutdown=false; found_dx = dx; } } if (do_shutdown) try { tts2.shutdown(); } catch(Exception e) {} TTSTest(batchNo+1,voices_to_set); } },ei.name); break; // we have to wait for 1st tts2 to be processed before starting next, hence batchNo } if(!found) { ttsList += "scan complete"; return ttsList; } else return "Scanning engines"; }""" android_src += br""" static final String js_common="""+b'"'+jsAnnot()+br""""; @SuppressWarnings("deprecation") @TargetApi(19) void runTimerLoop() { if(AndroidSDK >= 19) { // on Android 4.4+ we can do evaluateJavascript while page is still loading (useful for slow-network days) - but setTimeout won't usually work so we need an Android OS timer final Handler theTimer = new Handler(Looper.getMainLooper()); theTimer.postDelayed(new Runnable() { @Override public void run() { final Runnable r = this; runOnUiThread(new Runnable() { @Override public void run() { browser.evaluateJavascript(((needJsCommon>0)?js_common:"")+"AnnotIfLenChanged()",new android.webkit.ValueCallback() { @Override public void onReceiveValue(String s) { theTimer.postDelayed(r,(s!=null && s.contains("sameLen"))?5000:1000); // s.equals("\"sameLen\"", is this true in all versions of the API?) } }); if(needJsCommon>0) --needJsCommon; } }); } },0); } } boolean nextBackHides = false, _isFocused = true; int needJsCommon=3; @Override public void onPause() { super.onPause(); nextBackHides = _isFocused = false; } // but may still be visible on Android 7+, so don't pause the browser yet @Override public void onResume() { _isFocused = true; super.onResume(); } @TargetApi(11) @Override public void onStop() { super.onStop(); if(browser!=null && AndroidSDK >= 11) browser.onPause(); } // NOW pause the browser (screen off or app not visible) @TargetApi(11) @Override public void onStart() { super.onStart(); if(browser!=null && AndroidSDK >= 11) browser.onResume(); } @Override public boolean onKeyDown(int keyCode, KeyEvent event) { if (keyCode == KeyEvent.KEYCODE_BACK) { if (nextBackHides) { nextBackHides = false; if(moveTaskToBack(true)) return true; } if (browser!=null && browser.canGoBack()) { final String fwdUrl=browser.getUrl(); browser.goBack(); if(AndroidSDK<19) return true; // before Android 4.4 we can't evaluateJavascript, and unclear if we can loadUrl javascript: when we don't have onPageFinished on back, but AnnotMonitor runs at a higher frequency so we let that do it instead of this needJsCommon=3; final Handler theTimer=new Handler(Looper.getMainLooper()); theTimer.postDelayed(new Runnable() { int tried=0; @Override public void run() { if(++tried==9) return; runOnUiThread(new Runnable() { @Override public void run() { if(browser.getUrl().equals(fwdUrl)) { // not yet finished going back theTimer.postDelayed(this,500); } else browser.evaluateJavascript("function annogenMakeFwd(){var e=document.getElementById('annogenFwdBtn'); if(e) e.style.display='inline'; else window.setTimeout(annogenMakeFwd,1000)}annogenMakeFwd()",null); }}); } },500); return true; } } return super.onKeyDown(keyCode, event); } @SuppressWarnings("deprecation") // using getText so works on API 1 (TODO consider adding a version check and the more-modern alternative android.content.ClipData c=((android.content.ClipboardManager)getSystemService(android.content.Context.CLIPBOARD_SERVICE)).getPrimaryClip(); if (c != null && c.getItemCount()>0) return c.getItemAt(0).coerceToText(this).toString(); return ""; ) @TargetApi(11) public String readClipboard() { if(AndroidSDK < Build.VERSION_CODES.HONEYCOMB) // SDK_INT requires API 4 but this works on API 1 return ((android.text.ClipboardManager)getSystemService(android.content.Context.CLIPBOARD_SERVICE)).getText().toString(); android.content.ClipData c=((android.content.ClipboardManager)getSystemService(android.content.Context.CLIPBOARD_SERVICE)).getPrimaryClip(); if (c != null && c.getItemCount()>0) { return c.getItemAt(0).coerceToText(this).toString(); } return ""; } @Override protected void onSaveInstanceState(Bundle outState) { if(browser!=null) browser.saveState(outState); } @Override protected void onDestroy() {""" if tts_js: android_src += br"if(tts_keep!=null) try { tts_keep.shutdown(); } catch(Exception e) {}" android_src += br""" if(isFinishing() && AndroidSDK<23 && browser!=null) browser.clearCache(true); super.onDestroy(); } // (Chromium bug 245549 needed this workaround to stop taking up too much 'data' (not counted as cache) on old phones; it MIGHT be OK in API 22, or even API 20 with updates, but let's set the threshold at 23 just to be sure. This works only if the user exits via Back button, not via swipe in Activity Manager: no way to catch that.) @SuppressWarnings("deprecation") // we use Build.VERSION.SDK only if we're on an Android so old that SDK_INT is not available: int AndroidSDK = (android.os.Build.VERSION.RELEASE.startsWith("1.") ? Integer.valueOf(Build.VERSION.SDK) : Build.VERSION.SDK_INT); WebView browser;""" if epub: android_src += b" boolean loadingEpub = false;" android_src += b"}\n" android_bringToFront=br"""package %%JPACKAGE%%; import android.annotation.TargetApi; import android.content.Intent; import android.os.Build; @TargetApi(3) public class BringToFront extends android.app.IntentService { public BringToFront() { super(""); } public BringToFront(String name) { super(name); } @Override protected void onHandleIntent(Intent workIntent) { Intent i = getPackageManager().getLaunchIntentForPackage(getApplicationContext().getPackageName()); i.setFlags(Intent.FLAG_ACTIVITY_NEW_TASK); startActivity(i); } } """ android_clipboard = br"""

Clipboard

waiting for clipboard contents

""" java_src = br"""package %%JPACKAGE%%; import java.io.*; public class Annotator { public Annotator(""" if android: # will need a context param to read from assets java_src += b"android.content.Context context" java_src += b") throws IOException" if zlib: java_src += b",java.util.zip.DataFormatException" java_src += b""" { try { data=new byte[%%DLEN%%]; } catch (OutOfMemoryError e) { throw new IOException("Out of memory! Can't load annotator!"); }""" if android: java_src += b'context.getAssets().open("annotate.dat").read(data);' else: java_src += b'this.getClass().getResourceAsStream("/annotate.dat").read(data);' if zlib: java_src += br""" java.util.zip.Inflater i=new java.util.zip.Inflater(); i.setInput(data); byte[] decompressed; try { decompressed=new byte[%%ULEN%%]; } catch (OutOfMemoryError e) { throw new IOException("Out of memory! Can't unpack annotator!"); } i.inflate(decompressed); i.end(); data = decompressed;""" java_src += br"addrLen = data[0] & 0xFF;" if post_normalise: java_src += b""" dPtr = 1; char[] rleDat; try { rleDat = new String(java.util.Arrays.copyOfRange(data,readAddr(),data.length), "UTF-16LE").toCharArray(); } catch (UnsupportedEncodingException e) { // should never happen with UTF-16LE return; } normalisationTable = new char[65536]; int maxRLE = rleDat[0]; char w=0; // Java char is unsigned short for(int cF=0; cF < rleDat.length; cF++) { if(rleDat[cF] <= maxRLE) for(int j=0; j inBytes.length) maxPos = inBytes.length; maxPos -= bytes.length; if(offset>nearbytes) offset-=nearbytes; else offset = 0; while(offset <= maxPos) { boolean ok=true; for(int i=0; i sPos=new java.util.LinkedList(); int c; while(true) { c = data[dPtr++] & 0xFF; if ((c & 0x80)!=0) dPtr += (c&0x7F); else if (c < 20) { int i = switchByte_inner(++c); if(i!=0) dPtr += (int)(data[dPtr+c+i-1]&0xFF); dPtr += c+c; } else switch(c) { case 50: dPtr = readAddr(); break; case 51: { int f = readAddr(), dO=dPtr; dPtr = f; readData() ; dPtr = dO; break; } case 52: return; case 60: { int nBytes = (int)(data[dPtr++]&0xFF) + 1; int i = switchByte_inner(nBytes); dPtr += (nBytes + i * addrLen); dPtr = readAddr(); break; } case 70: s0(); break; case 71: case 74: { int numBytes = data[dPtr++] & 0xFF; while((numBytes--)!=0) o(inBytes[writePtr++]); if(c==74) return; else break; } case 72: case 75: { int numBytes = data[dPtr++] & 0xFF; byte[] annot = readRefStr(); s(); o(""); while((numBytes--)!=0) o(inBytes[writePtr++]); o(""); o(annot); o(""); if(c==75) return; else break; } case 73: case 76: { int numBytes = data[dPtr++] & 0xFF; byte[] annot = readRefStr(); byte[] title = readRefStr(); s(); o(""); while((numBytes--)!=0) o(inBytes[writePtr++]); o(""); o(annot); o(""); if(c==76) return; else break; } case 80: sPos.addFirst(inPtr); break; case 81: inPtr=sPos.removeFirst(); break; case 90: { int tPtr = readAddr(); int fPtr = readAddr();""" if existing_ruby_shortcut_yarowsky: java_src += br""" if (shortcut_nearTest) { dPtr = (tPtr 1) or not labelToJump: # ('not labelToJump' and 'not isFunc' == main program) savePos = len(self.l) self.addOpcode('savepos') else: savePos = None if b"" in byteSeq_to_action_dict and len(byteSeq_to_action_dict) > 1 and len(byteSeq_to_action_dict[b""])==1 and not byteSeq_to_action_dict[b""][0][1] and all((len(a)==1 and a[0][0][:len(byteSeq_to_action_dict[b""][0][0])]==byteSeq_to_action_dict[b""][0][0] and not a[0][1]) for a in itervalues(byteSeq_to_action_dict)): self.addActions(byteSeq_to_action_dict[b""][0][0]) l = len(byteSeq_to_action_dict[b""][0][0]) byteSeq_to_action_dict = dict((x,[(y[l:],z)]) for x,[(y,z)] in iteritems(byteSeq_to_action_dict)) del self.l[savePos] ; savePos = None del byteSeq_to_action_dict[b""] self.addActionDictSwitch(byteSeq_to_action_dict) # as a subfunction (ends up adding the call to it, which should be replaced by a jump during compaction; TODO: auto-inline if it turns out there's only this one call to it? other calls might happen if it's merged with an identical one) byteSeq_to_action_dict[b""] = [(b"",[])] # for the end of this func self.addOpcode('return') elif allBytes: allBytes = sorted(list(allBytes)) labels = [self.makeLabel() for b in allBytes+[0]] self.addByteswitch(allBytes,labels) for case in allBytes: self.addLabelHere(labels[0]) ; del labels[0] self.addActionDictSwitch(dict([(k[1:],v) for k,v in iteritems(byteSeq_to_action_dict) if k[:1]==case]),False,labels[-1]) self.addLabelHere(labels[0]) if not savePos==None: self.addOpcode('restorepos') if isFunc: self.addOpcode('return') if self.l[-1]==self.l[-2]: del self.l[-1] # double return return self.finishFunctionAndAddCall() elif b"" in byteSeq_to_action_dict: default_action = b"" for action,conds in byteSeq_to_action_dict[b""]: if conds: if type(conds)==tuple: negate,conds,nbytes = conds else: negate,nbytes = False,ybytes_max assert 1 <= nbytes <= 255, "bytecode supports only single-byte nbytes (but nbytes=0 is reserved for expansion)" trueLabel,falseLabel = self.makeLabel(),self.makeLabel() self.addOpcode('neartest') self.addRef(trueLabel) self.addRef(falseLabel) assert type(nbytes)==int self.addBytes(nbytes) for c in conds: self.addRefToString(c.encode(outcode)) # TODO: how much bytecode could we save by globally merging equivalent lists of string-list references ? (zlib helps anyway but...) if negate: trueLabel,falseLabel = falseLabel,trueLabel self.addLabelHere(trueLabel) self.addActions(action) self.addOpcode('return') self.addLabelHere(falseLabel) else: default_action = action if default_action or not byteSeq_to_action_dict[b""]: self.addActions(default_action) self.addOpcode('return') ; return if labelToJump: self.addOpcode('jump') self.addRef(labelToJump) else: self.addOpcode('return') def makeLabel(self): self.lastLabelNo += 1 return self.lastLabelNo def addLabelHere(self,labelNo): assert type(labelNo)==int assert labelNo, "label 0 not allowed" self.l.append(labelNo) def addRef(self,labelNo): assert type(labelNo)==int self.l.append(-labelNo) def addRefToString(self,string): assert type(string)==bytes, repr(string) l = len(string) if python or java or javascript or dart: # prepends with a length hint if possible (or if not # prepends with 0 and null-terminates it) if js_utf8: string = unicodedata.normalize("NFC",string.decode('utf-8')) # NFC very important for browser_extension: some browsers seem to do it anyway, throwing off data addresses if we haven't accounted for that l = len(string) # we count in UCS-2 characters assert all((ord(c) < 0xD800 or 0xE000 < ord(c) <= 0xFFFF) for c in string), "js_utf8 addressing will be confused by non UCS-2: "+repr(string) # Surrogate pairs would cause invalid UTF-8, don't know which if any Javascript or Dart implementations would take them # Have checked browsers + Node count combining characters separately, so len(string) should be correct (e.g. u'Moc\u0306nik') if 1 <= l < 0x02B0: # can use length-first unichr (avoid combining and modifier marks just in case; also avoid 0xD800+ surrogates) string = unichr(l) + string else: string = unichr(0)+string+unichr(0) elif js_6bit: string = re.sub(b"%(?=[0-9A-Fa-f])|[\x7f-\xff]",lambda m:urllib.quote(m.group()),string) # for JS 'unescape' in readRefStr, which is applied (without encodeURIComponent) if js_6bit and not js_utf8 so we can use %-encoding l = len(string) # length is needed BEFORE %-decode if 1 <= l <= 91: # use 32-122 inclusive string = B(chr(l+31))+string else: # try to avoid using \x00 for termination for termChar in '{|}~\x00': # 123-126 + nul termChar=B(termChar) if not termChar in string: string = termChar + string + termChar break elif 1 <= l < 256: # length byte + string string = B(chr(l))+string else: string = B(chr(0))+string+B(chr(0)) else: string += b'\x00' # just null-termination for C if not string in self.d2l: self.d2l[string] = (-len(self.d2l)-1,) self.l.append(self.d2l[string]) def link(self): # returns resulting bytes # (add an 'end program' instruction before calling) def f(*args): raise Exception("Must call link() only once") self.link = f sys.stderr.write("Linking... ") ; sys.stderr.flush() def dl(t): r = [(x,y) for x,y in iteritems(self.d2l) if type(x)==t] if not t==tuple: r.sort() # so we can optimise for overlaps (but don't let Python 3 try to compare across types, it's more fussy than Python 2) return r d2l = dl(bytes)+dl(unicode)+dl(tuple) # the functions and data to add to the end of self.l assert len(d2l)==len(self.d2l), "missed out a key type" for dat,ref in d2l: assert type(ref)==tuple and type(ref[0])==int self.l.append((-ref[0],)) # the label if type(dat) in [bytes,unicode]: if type(self.l[-2])==type(dat) and self.l[-2][-1]==dat[0]: # overlap of termination-byte indicators (TODO: look for longer overlaps? unlikely to occur) self.l[-2] = self.l[-2][:-1] self.l.append(dat) ; continue # otherwise it's a function, and non-reserved labels are local, so we need to rename them l2l = {} # local label to renamed label for i in dat: if type(i)==int: if i>0: j=i else: j=-i if not j in l2l: l2l[j] = self.makeLabel() if i>0: self.addLabelHere(l2l[j]) else: self.addRef(l2l[j]) else: self.l.append(i) # str or tuple just cp del self.d2l if post_normalise and not javascript: # must be AFTER d2l, as EOF is used to end it normLabel = self.makeLabel() self.l.insert(0,-normLabel) self.l.append(normLabel) bmp = [(k,v) for k,v in sorted(post_normalise.items())] maxRLE = min(bmp[0][0],min(v for k,v in bmp))-1 assert maxRLE >= 0, "can't have a mapping to 0" curPtr = 0 def lsbmsb(i): assert type(i)==int and 0<=i<=0xFFFF return B(chr(i&0xFF)+chr(i>>8)) for i in xrange(len(bmp)): delta = bmp[i][0]-curPtr while delta: skip = min(delta,maxRLE) self.l.append(lsbmsb(skip)) delta -= skip ; curPtr += skip self.l.append(lsbmsb(bmp[i][1])) curPtr += 1 # elements of self.l are now: # - (byte) strings (just copied in) # - positive integers (labels for code) # - negative integers (references to labels) # - +ve or -ve integers in tuples (labels for functions and text strings: different 'namespace') # strings in tuples: opcodes # 1st byte of o/p is num bytes needed per address class TooNarrow(Exception): pass if js_6bit: aBits,aMask = 6,0x3F else: aBits,aMask = 8,0xFF for addrSize in xrange(1,256): sys.stderr.write("(%d-bit) " % (aBits*addrSize)) sys.stderr.flush() src = self.l[:] # must start with fresh copy, because compaction modifies src and we don't want a false start with wrong addrSize to affect us try: compacted = 0 ; compaction_types = set() # The compact opcodes all rely on relative addressing (relative to AFTER the compact instruction) that goes only forward. Easiest way to deal with that is to work backwards from the end, inlining the compactions, before running a conventional 2-pass assembly. # TODO: Could move the below loop into this one in its entirety, and just assemble backwards. Most within-function label references point forwards anyway. (Would still need some backward refs for functions though) bytesFromEnd = 0 lDic = {} # labelNo -> bytesFromEnd def LGet(lRef,origOperandsLen): # Return the number of bytes between the end of the proposed compact instruction and the label, to see if it's small enough to fit inside the compact instruction. Since bytesFromEnd includes origOperandsLen, we need to subtract that out, which would then leave bytes from end of code to end of proposed new instruction (whatever its length will be), and then subtracting the bytesFromEnd of the label will give the number of forward bytes we want. if not -lRef in lDic: return -1 return bytesFromEnd-origOperandsLen-lDic[-lRef] counts_to_del = set() for count in xrange(len(src)-1,-1,-1): i = src[count] if type(i)==tuple and type(i[0])==str: opcode = i[0] i = "-" # for len() at end of block if opcode in ['copyBytes','o','o2'] and src[count+['copyBytes','o','o2'].index(opcode)+2]==('return',): # 74 to 76 = 71 to 73 + return src[count] = B(chr(['copyBytes','o','o2'].index(opcode)+74)) counts_to_del.add(count+['copyBytes','o','o2'].index(opcode)+2) compacted += 1 ; bytesFromEnd -= 1 compaction_types.add('return') elif opcode=='call' and src[count+2]==('return',): src[count] = ('jump',) counts_to_del.add(count+2) compacted += 1 ; bytesFromEnd -= 1 compaction_types.add(opcode) # can't fall through by setting opcode='jump', as the address will be in the function namespace (integer in tuple, LGet would need adjusting) and is highly unlikely to be within range (TODO: unless we try to arrange the functions to make it so for some cross-calls) elif opcode=='jump' and 0 <= LGet(src[count+1],addrSize) < 0x80: # we can use a 1-byte relative forward jump (up to 128 bytes), useful for 'break;' in a small switch offset = LGet(src[count+1],addrSize) if offset == 0: # can remove this jump completely i = "" # for len() at end of block compacted += 1 counts_to_del.add(count) # zap jmp else: src[count] = i = B(chr(0x80 | offset)) # new instr: 0x80|offset counts_to_del.add(count+1) # zap the label compacted += addrSize # as we're having a single byte instead of byte + address bytesFromEnd -= addrSize compaction_types.add(opcode) elif opcode=='switchbyte': numItems = len(src[count+2]) # = ord(src[count+1]) + 1 if 1 <= numItems <= 20: numLabels = numItems+1 # there's an extra default label at the end origOperandsLen = 1+numItems+numLabels*addrSize # number + N bytes + the labels if LGet(src[count+3],origOperandsLen)==0 and all(0 <= LGet(src[count+N],origOperandsLen) <= 0xFF-js_6bit_offset for N in xrange(4,3+numLabels)): # 1st label is immediately after the switchbyte, and all others are in range if javascript or dart: # use printable range if js_6bit and numItems<=17 and all(0x80<=ord(x)<=0xBF or 0xD4<=ord(x)<=0xEF for x in S(src[count+2])): # if bytes being switched on are all from UTF-8 representations of U+0500 through U+FFFF, move to printable range (in one test this saved 780k for the continuation bytes and another 200k for the rest) def mv(x): if x>=0xD4: x -= 20 # or, equivalently, if (x-93)>118, which is done to the input byte in JS before searching on these return B(chr(x-93)) src[count+2]=b''.join(mv(ord(x)) for x in S(src[count+2])) i = B(chr(ord(src[count+1])+91)) # and a printable opcode else: i = B(chr(ord(src[count+1])+108)) # can't make the match bytes printable, but at least we can have a printable opcode 108-127 for short switchbyte in Javascript or Dart else: i = B(src[count+1]) # 0-19 for short switchbyte in C,Java,Python src[count] = i = i+src[count+2]+b''.join(B(chr(LGet(src[count+N],origOperandsLen)+js_6bit_offset)) for N in xrange(4,3+numLabels)) # opcode_including_nItems, string of bytes, offsets (assume 1st offset at count+3 is 0 so not listed) for ctd in xrange(count+1,count+3+numLabels): counts_to_del.add(ctd) newOperandsLen = numItems*2 # for each byte, the byte itself and an offset, + 1 more offset as default, - 1 because first is not given compacted += origOperandsLen-newOperandsLen bytesFromEnd -= origOperandsLen # will add new opCode + operands below compaction_types.add(opcode) elif type(i) in [int,tuple]: # labels if type(i)==int: i2 = i else: i2 = i[0] assert type(i2)==int if i2 > 0: lDic[i] = bytesFromEnd ; i = "" if bytesFromEnd >> (aBits*addrSize+1): raise TooNarrow() # fair assumption (but do this every label, not every instruction) else: i = "-"*addrSize # a reference bytesFromEnd += len(i) src=[s for s,i in zip(src,xrange(len(src))) if not i in counts_to_del] # batched up because del is O(n) # End of opcode compaction lDic = {} # label dictionary: labelNo -> address for P in [1,2]: r = [B(chr(addrSize))] # List to hold the output bytecode, initialised with a byte indicating how long our addresses will be. ll = 1 # cumulative length of output list, normally in bytes, but if js_utf8 then we count in Javascript (UCS-2) characters count = 0 # reading through src opcodes etc while count < len(src): i = src[count] ; count += 1 if type(i)==tuple and type(i[0])==str: i = B(chr(BytecodeAssembler.opcodes[i[0]])) elif type(i) in [int,tuple]: # labels if type(i)==int: i2,iKey = i,-i # +ve integers are labels, -ve integers are references to them else: i2,iKey = i[0],(-i[0],) # reserved labels (a different counter, handled here by putting the key in a tuple) assert type(i2)==int # At this point, if i2<0 then iKey will be the lDic key for looking up the label. if i2 > 0: # label going in here: set lDic etc (without outputting any bytes of course) if (ll >> (aBits*addrSize)): raise TooNarrow() # on the assumption that somebody will reference this label, figure out early that we need more bits if i in lDic: assert lDic[i] == ll, "%s moved %d->%d" % (repr(i),lDic[i],ll) lDic[i] = ll ; i = "" elif iKey in lDic: # known label i = lDic[iKey] # the address to convert to MSB-LSB bytes and output: shift = aBits*addrSize if (i >> shift): raise TooNarrow() j = [] for b in xrange(addrSize): # MSB-LSB (easier to do in JS) shift -= aBits j.append(B(chr(((i>>shift)&aMask)+js_6bit_offset))) i = b"".join(j) assert len(i)==addrSize else: # ref to as-yet unknown label assert P==1, "undefined label %d" % -i i = B("-"*addrSize) # placeholder (well we could just advance ll, but setting this makes things easier if you ever want to inspect partial results) if len(i): # bytes or Unicode r.append(i) ; ll += len(i) sys.stderr.write(".") ; sys.stderr.flush() if js_utf8: # normalise all before join for i in xrange(len(r)): if type(r[i])==bytes: r[i]=unicode(r[i],'latin1') r = u"".join(r) else: r = b"".join(r) if zlib: self.origLen = ll # needed for efficient malloc in the C code later oR,r = r,zlib.compress(r,9) sys.stderr.write("%d bytes (%s compressed from %d after opcode compaction saved %d on %s)\n" % (len(r),zlib_name,ll,compacted,','.join(sorted(list(compaction_types))))) else: sys.stderr.write("%d bytes (opcode compaction saved %d on %s)\n" % (ll,compacted,','.join(sorted(list(compaction_types))))) return r except TooNarrow: pass assert 0, "can't even assemble it with 255-byte addressing !?!" def js_escapeRawBytes(s): if js_utf8: # type(s)==type(u"") s = s.replace("\\",r"\\").replace('"',r'\"').replace(chr(8),r"\b").replace(chr(9),r"\t").replace(chr(10),r"\n").replace(chr(12),r"\f").replace(chr(13),r"\r") if ignore_ie8: s = s.replace(chr(11),r"\v") if js_octal: s = re.sub("[\x00-\x1f](?![0-9])",lambda m:r"\%o"%ord(m.group()),s) else: s = re.sub(chr(0)+r"(?![0-9])",r"\\0",s) # \0 is allowed even if not js_octal (and we need \\ because we're in a regexp replacement) return re.sub(b"[\x00-\x1f\x7f]",lambda m:br"\x%02x"%ord(m.group()),s.encode('utf-8')) elif type(s)==type(u""): # if we're being passed a Unicode string when not js_utf8, then we must be being called from post_normalise and we want \uNNNN output return re.sub("[^\x20-\x7e]",lambda m:r"\u%04x"%ord(m.group()),s).encode('latin1') # otherwise typeof(s)==typeof(b"") s = s.replace(b"\\",br"\\").replace(b'"',br'\"').replace(B(chr(8)),br"\b").replace(B(chr(9)),br"\t").replace(B(chr(10)),br"\n").replace(B(chr(12)),br"\f").replace(B(chr(13)),br"\r") if ignore_ie8: s = s.replace(B(chr(11)),br"\v") if js_octal: s = re.sub(b"[\x00-\x1f](?![0-9])",lambda m:br"\%o"%ord(m.group()),s) else: s = re.sub(b'\x00'+br"(?![0-9])",br"\\0",s) # \0 is allowed even if not js_octal (and we need \\ because we're in a regexp replacement) return re.sub(b"[\x00-\x1f\x7f-\xff]",lambda m:br"\x%02x"%ord(m.group()),s) if not browser_extension: js_start = b'/* Javascript '+version_stamp+br""" Usage: - You could just include this code and then call the annotate() function i.e. var result = annotate(input""" if sharp_multi: js_start += b", annotation_type_number" if glossfile: js_start += b", lines=2" js_start += b")" if not os.environ.get("JS_OMIT_DOM",""): js_start += br""" or, if you're in a browser and have loaded a page, annotate_page(""" if sharp_multi: js_start += b"annotation_type_number" if glossfile: js_start += b"," if glossfile: js_start += b"lines=2" js_start += br""") (run annogen with JS_OMIT_DOM environment variable set if you want to omit the annotate_page code)""" js_start += br""" - Or you could use (and perhaps extend) the Annotator object, and call its annotate() method. If you have Backbone.JS, Annotator will instead be a generator (extending Backbone.Model) which you will have to instantiate yourself (possibly after extending it). The Annotator object/class is also what will be exported by this module if you're using Common.JS. - On Unix systems with Node.JS, you can run this file in "node" to annotate standard input as a simple test. */ """ else: js_start = b"" # browser_extension js_start += b"var Annotator={\n" if not browser_extension: js_start += b" version: '"+version_stamp+b"',\n" if glossfile: js_start += b"numLines: 2 /* override to 1 or 3 if you must, but not recommended for learning */,\n" if known_characters: js_start += b"numKnownGroups: 0 /* override to number of \"known\" groups of characters (words composed entirely of these will be annotated with CSS class 'known') */,\n" if sharp_multi: js_start += b"annotate: function(input,aType) { if(aType==undefined) aType=0;" else: js_start += b"annotate: function(input) {" if removeSpace: js_start += br" input=input.replace(/\B +\B/g,'');" # TODO: document that we do this (currently only in JS annotator here, and Android app via jsAnnot, although Web Adjuster does it separately in Python before calling the filter). It deals with software that adds ASCII spaces between Chinese characters of the same word, without deleting spaces between embedded English words (TODO: this 'JS + app' version may still delete spaces between punctuation characters, which may be an issue for consecutive quoted words e.g. 'so-called "word1" "word2"'). If doing it at the nextbyte level, we'd have to update prevbyte; if this or doing it at switchbyte level (e.g. recurse) we'd have to do something about the copy pointer (skip the spaces?) and the near-call distance (and associated buffer sizes in C) so they're best pre-removed, but only from between characters we annotate. if post_normalise: js_start += br""" var nChars = this.nChars; var origInBytes = unescape(encodeURIComponent(input)); input = input.replace(/./g,function(m){return nChars[m]||m}); if(this.contextL_u8) { this.contextL_u8=unescape(encodeURIComponent(decodeURIComponent(escape(this.contextL_u8)).replace(/./g,function(m){return nChars[m]||m}))); origInBytes = this.contextL_u8 + origInBytes } if(this.contextR_u8) this.contextR_u8=unescape(encodeURIComponent(decodeURIComponent(escape(this.contextR_u8)).replace(/./g,function(m){return nChars[m]||m})));""" js_start += br""" input = unescape(encodeURIComponent(input)); // to UTF-8 var data = this.data""" # TODO: if input is a whole html doc, insert css in head (e.g. from annoclip and/or adjuster), and hope there's no stuff that's not to be annotated (form fields etc). But really want them to be using browser_extension or annotate_page if doing this (TODO add css to annotate_page, already there in browser_extension) if glossfile: js_start += b", numLines = this.numLines" if known_characters: js_start += b", numKnownGroups = this.numKnownGroups, hFreq = this.hFreq" js_start += br"""; var addrLen = data.charCodeAt(0), dPtr; var p = 0; // read-ahead pointer if(this.contextL_u8) { var cL=this.contextL_u8; input = cL+input; p=cL.length } var inputLength = input.length; if(this.contextR_u8) input += this.contextR_u8; var copyP = p; // copy pointer var output = new Array(), needSpace = 0; function readAddr() { var i,addr=0; for (i=addrLen; i; i--) addr=(addr << """ if js_6bit: js_start += b"6) | (data.charCodeAt(dPtr++)-"+B(str(js_6bit_offset))+b");" else: js_start += b"8) | data.charCodeAt(dPtr++);" js_start += br""" return addr; } function readRefStr() { var a = readAddr(); var l=data.charCodeAt(a);""" if js_6bit and not js_utf8: js_start += br""" if(l && l<123) a = data.slice(a+1,a+l-30); else a = data.slice(a+1,data.indexOf(data.charAt(a),a+1));""" else: js_start += br""" if (l != 0) a = data.slice(a+1,a+l+1); else a = data.slice(a+1,data.indexOf('\x00',a+1));""" if js_utf8: js_start += b"return unescape(encodeURIComponent(a))" # Unicode to UTF-8 (TODO: or keep as Unicode? but copyP things will be in UTF-8, as will the near tests) elif js_6bit: js_start += b"return unescape(a)" # %-encoding else: js_start += b"return a" js_start += br"""} function s() { if (needSpace) output.push(" "); else needSpace=1; // for after the word we're about to write (if no intervening bytes cause needSpace=0) } function readData() { var sPos = new Array(), c; while(1) { c = data.charCodeAt(dPtr++); if (c & 0x80) dPtr += (c&0x7F);""" if js_6bit: js_start += br""" else if (c > 90) { c-=90; var i=-1;if(p118)cc-=20; i=data.slice(dPtr,dPtr+c).indexOf(String.fromCharCode(cc))} if (i==-1) i = c; if(i) dPtr += data.charCodeAt(dPtr+c+i-1)-"""+B(str(js_6bit_offset))+br"""; dPtr += c+c }""" else: js_start += br""" else if (c > 107) { c-=107; var i = ((p>=inputLength)?-1:data.slice(dPtr,dPtr+c).indexOf(input.charAt(p++))); if (i==-1) i = c; if(i) dPtr += data.charCodeAt(dPtr+c+i-1); dPtr += c+c; }""" js_start += br""" else switch(c) { case 50: dPtr = readAddr(); break; case 51: { var f = readAddr(); var dO=dPtr; dPtr = f; readData() ; dPtr = dO; break; } case 52: return; case 60: { var nBytes = data.charCodeAt(dPtr++)+1; var i = ((p>=inputLength)?-1:data.slice(dPtr,dPtr+nBytes).indexOf(input.charAt(p++))); if (i==-1) i = nBytes; dPtr += (nBytes + i * addrLen); dPtr = readAddr(); break; } case 70: if(needSpace) { output.push(' '); needSpace=0; } break; case 71: case 74: { var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF; var base = input.slice(copyP, copyP + numBytes); output.push(base); copyP += numBytes; if(c==74) return; break; } case 72: case 75: { var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF; var annot = readRefStr(); var base = input.slice(copyP, copyP + numBytes); copyP += numBytes; s();""" if glossfile: js_start += br""" switch (numLines) { case 1: output.push(""); output.push(base); output.push(""); break; case 3: output.push(" "); output.push(annot); output.push(""); output.push(base); output.push(""); break; default:""" js_start += br""" output.push(""); output.push(base); output.push(""); output.push(annot); output.push("")""" if glossfile: js_start += b"}" else: js_start += b";" js_start += br""" if(c==75) return; break; }""" if glossfile: js_start += br""" case 73: case 76: { var numBytes = (data.charCodeAt(dPtr++)-34)&0xFF; var annot = readRefStr(); var title = readRefStr(); var base = input.slice(copyP, copyP + numBytes); copyP += numBytes; s(); switch (numLines) { case 1: output.push(""); output.push(base); output.push(""); output.push(""); break; case 3: output.push(""); output.push(title.match(/[^/(;]*/)[0]); output.push(""); output.push(annot); output.push(""); output.push(base); output.push(""); break; default: output.push(""); output.push(base); output.push(""); output.push(annot); output.push("") } if(c==76) return; break; }""" if not js_6bit: js_start = js_start.replace(b"(data.charCodeAt(dPtr++)-34)&0xFF",b"data.charCodeAt(dPtr++)") js_start += br""" case 80: sPos.push(p); break; case 81: p=sPos.pop(); break; case 90: { var tPtr = readAddr(); var fPtr = readAddr(); var nearbytes = data.charCodeAt(dPtr++); var o=p; if (o > nearbytes) o -= nearbytes; else o = 0; var max = p + nearbytes; if (max > input.length) max = input.length; // not inputLength: we include contextR_u8 var tStr = input.slice(o,max); var found = 0; while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(readRefStr()) != -1) { found = 1; break; } dPtr = found ? tPtr : fPtr; break; } default: throw("corrupt data table at "+(dPtr-1)+"/"+data.length+" ("+c+")"); } } } while(p < inputLength) { var oldPos=p; dPtr=1;readData(); if (oldPos==p) { needSpace=0; output.push(input.charAt(p++)); copyP++; } } output=decodeURIComponent(escape(output.join("")));""" if known_characters: js_start += br""" if(numKnownGroups) output=output.replace(new RegExp("(["+hFreq.slice(0,numKnownGroups).join('')+"]+.*?)",'g'),"$1 class=known$2"); """ # TODO: pre-cache hFreq regex on select (like in Android)? js_start += br""" return output""" if js_6bit: js_start = js_start.replace(b"var numBytes = data.charCodeAt(dPtr++);",b"var numBytes = (data.charCodeAt(dPtr++)-"+B(str(js_6bit_offset-1))+b")&0xFF;") if sharp_multi: js_start += br""".replace(new RegExp("()"+"[^#]*#".repeat("""+annotMap("aType")+br""")+"(.*?)(#.*?)?, but this regexp will also work if someone changes the generated code to put annotation into second and title into as long as annotation is not given first. Cannot put [^#<] as there might be ^{etc in the annotation, and .*?# still matches across ...}... :-( js_start += br"""; // from UTF-8 back to Unicode }""" # end of annotate method if post_normalise: js_start += b',\nnChars:(Object.fromEntries?Object.fromEntries:function(e){o={};Object.keys(e).forEach(function(k){[k,v]=e[k];o[k]=v});return o})(function(){var t="'+js_escapeRawBytes(u''.join(unichr(c) for c in post_normalise.values()))+b'".split("");return "'+js_escapeRawBytes(u''.join(unichr(c) for c in post_normalise.keys()))+b'".split("").map(function(e,i){return [e,t[i]]})}())' if known_characters: js_start += b",\nhFreq: "+knownCharsGroupsArray if not browser_extension: js_start += b",\n" # data: ... \n goes here (browser_extension reads it from annotate-dat.txt instead) if post_normalise: js_start = js_start.replace(b"input.slice(copyP",b"origInBytes.slice(copyP").replace(b"push(input.charAt",b"push(origInBytes.charAt") js_end = br"""}; function annotate(input""" if sharp_multi: js_end += b",aType" if glossfile: js_end += b",numLines" if known_characters: js_end += b",numKnownGroups" js_end += b""",contextL_u8,contextR_u8) { Annotator.contextL_u8=contextL_u8; Annotator.contextR_u8=contextR_u8; """ if glossfile: js_end += b"if(numLines==undefined) numLines=2; Annotator.numLines=numLines; " if known_characters: js_end += b"Annotator.numKnownGroups=numKnownGroups; " js_end += b"return Annotator.annotate(input" if sharp_multi: js_end += b",aType" js_end += b")}" if browser_extension: if gecko_id: if not gecko_id.startswith("{") or not gecko_id.endswith("}"): gecko_id = "{" + gecko_id + "}" else: gecko_id = "" if manifest_v3: js_end += br""" function restoreOld(numLines,aType) { for(let c of Array.prototype.slice.call(document.getElementsByClassName("_adjust0"))) if(c.oldTxt) c.parentNode.replaceChild(document.createTextNode(c.oldTxt),c); else if(c.oldHtml)c.parentNode.replaceChild(new DOMParser().parseFromString(c.oldHtml,"text/html").body.firstChild.cloneNode(true),c); document.annotWalkOff=(numLines==1);document.aType=aType;annotWalk(document,document) }""" else: js_end += br""" if(localStorage.aType===undefined) localStorage.aType=0; if(localStorage.numLines===undefined) localStorage.numLines=2; var aType=localStorage.aType,numLines=localStorage.numLines,numKnownGroups=localStorage.numKnownGroups;""" # TODO: some of this can be omitted if not sharp_multi, not glossfile or not known_characters, and similarly above and below. Low priority because this part is for browser extension and all three of these will likely be on. js_end += br"""function handleMessage(request, sender, sendResponse) {""" if manifest_v3: js_end += br""" chrome.storage.local.get(["aType"],(aType)=>{ chrome.storage.local.get(["numLines"],(numLines)=>{ chrome.storage.local.get(["numKnownGroups"],(numKnownGroups)=>{ aType=aType["aType"];numLines=numLines["numLines"];numKnownGroups=numKnownGroups["numKnownGroups"]; if(aType===undefined) aType=0; if(numLines===undefined) numLines=2; if(numKnownGroups===undefined) numKnownGroups=0; if(typeof request=='number') { if(request<0) numLines=-request; else if(request-Math.floor(request)>=.9) numKnownGroups=Math.floor(request); else { aType=request; if(numLines==1)numLines=2 } } chrome.storage.local.set({["aType"]: aType, ["numLines"]: numLines, ["numKnownGroups"]: numKnownGroups},()=>{ """ js_end += b"if(typeof request=='number') {" if manifest_v3: js_end += br""" (chrome.tabs && chrome.tabs.query?chrome.tabs.query:browser.tabs.query)({},function(T){for (let t of T)(chrome.scripting && chrome.scripting.executeScript?chrome.scripting.executeScript:browser.scripting.executeScript)({target:{tabId:t.id,allFrames:true},func: restoreOld, args:[numLines,aType]},()=>{chrome.runtime.lastError})}); // ignore lastError as it's likely to be "cannot access chrome:// URL" if one of the tabs in the extension manager sendResponse(true);""" else: js_end += br""" if(request<0) localStorage.numLines=numLines=-request; else if(request-Math.floor(request)>=.9) localStorage.numKnownGroups=numKnownGroups=Math.floor(request); else {localStorage.aType=aType=request;if(numLines==1)localStorage.numLines=numLines=2} (chrome.tabs && chrome.tabs.query?chrome.tabs.query:browser.tabs.query)({},function(T){for (let t of T)(chrome.tabs && chrome.tabs.executeScript?chrome.tabs.executeScript:browser.tabs.executeScript)(t.id,{allFrames: true, code: 'for(let c of Array.prototype.slice.call(document.getElementsByClassName("_adjust0")))if(c.oldTxt)c.parentNode.replaceChild(document.createTextNode(c.oldTxt),c);else if(c.oldHtml)c.parentNode.replaceChild(new DOMParser().parseFromString(c.oldHtml,"text/html").body.firstChild.cloneNode(true),c);'+(numLines==1?'document.annotWalkOff=1':'document.annotWalkOff=0;document.aType='+aType+';annotWalk(document,document)')})})""" js_end += br""" } else if(typeof request=='boolean') sendResponse(request?(numLines==1?-1:aType):numLines); // status query (used by popup and by initial off/on) else if(request=='g') sendResponse(numKnownGroups); else {""" if not manifest_v3: js_end += br""" if(request==null) request={'t':getClip()};""" js_end += b"sendResponse(numLines>1?annotate(request['t']" if sharp_multi: js_end += b",aType" if glossfile: js_end += b",numLines" if known_characters: js_end += b",numKnownGroups" js_end += b",request['l'],request['r']):request['t'])}}" if manifest_v3: js_end += br""")})})}); return true}""" else: js_end += br""" function getClip(){var area=document.createElement("textarea"); document.body.appendChild(area); area.focus();area.value='';document.execCommand("Paste");var txt=area.value; document.body.removeChild(area); return txt?txt:"Failed to read clipboard"}""" js_end += br"""fetch((typeof browser!='undefined'&&browser.runtime&&browser.runtime.getURL?browser.runtime.getURL:chrome."""+(b"runtime" if manifest_v3 else b"extension")+br""".getURL)("annotate-dat.txt")).then(function(r){r.text().then(function(r){Annotator.data=r;chrome.runtime.onMessage.addListener(handleMessage)})})""" # if not js_utf8, having to encode latin1 as utf8 adds about 25% to the file size, but text() supports only utf8; could use arrayBuffer() instead, but inefficient to read w. DataView(buf,offset,1), or could reinstate zlib (probably using base64 read in from file: would probably need to include a versioned unzip library instead of inline-minified subset) elif not os.environ.get("JS_OMIT_DOM",""): js_end += br""" function annotate_page(""" if sharp_multi: js_end += b"aType" if glossfile: js_end += (b"" if js_end.endswith(b"(") else b",") + b"numLines" if known_characters: js_end += (b"" if js_end.endswith(b"(") else b",") + b"numKnownGroups" js_end += b") { " if glossfile: js_end += b"if(numLines==undefined) numLines=2; Annotator.numLines=numLines; " if known_characters: js_end += b"Annotator.numKnownGroups=numKnownGroups; " js_end += jsAnnot(False) + br"""return annotWalk(document,document) }""" if not browser_extension: js_end += br""" if (typeof Backbone != "undefined" && Backbone.Model) { Annotator = Backbone.Model.extend(Annotator);""" if sharp_multi: js_end += br""" annotate=function(input,aType) { return new Annotator().annotate(input,aType) }""" else: js_end += br""" annotate=function(input) { return new Annotator().annotate(input) }""" js_end += br""" } if (typeof require != "undefined" && typeof module != "undefined" && require.main === module) { // Node.js command-line test fs=require('fs'); process.stdout.write(annotate(fs.readFileSync('/dev/stdin').toString())); } else if (typeof module != "undefined" && module.exports) { // Common.js module.exports = Annotator; } """ extension_rubycss = b"span._adjust0 ruby{display:inline-table !important;vertical-align:bottom !important;-webkit-border-vertical-spacing:1px !important;padding-top:0.5ex !important;margin:0px !important} span._adjust0 ruby *{display: inline !important;vertical-align:top !important;line-height:1.0 !important;text-indent:0 !important;text-align:center !important;padding-left:0px !important;padding-right:0px !important} span._adjust0 rb{display:table-row-group !important;font-size:100% !important; opacity: 1.0 !important} span._adjust0 rt{display:table-header-group !important;font-size:100% !important;line-height:1.1 !important; opacity: 1.0 !important;font-family: FreeSerif, Lucida Sans Unicode, Times New Roman, serif !important}" if known_characters: extension_rubycss += b"span._adjust0 ruby rt.known{display: none !important}" extension_config=br""" """ # -ve = num lines (if glossfile), +ve = annotNo (if sharp-multi) if glossfile: extension_config += b' ' rangeStart = -3 else: rangeStart = -2 extension_config=extension_config.replace(b'2-line',b'On') extension_config += b'' if sharp_multi and annotation_names and ',' in annotation_names: extension_config += b"".join((b'
' % (num,B(name))) for num,name in enumerate(annotation_names.split(','))) rangeEnd = len(annotation_names.split(',')) else: rangeEnd = 0 if known_characters: extension_config += b'" extension_config += b'

' # Don't want Clipboard button to auto-refresh (and hide the button) in the desktop extension version, since would need to stop the refresh when view is no longer visible + is it really a good idea to timer-paste the clipboard on a desktop when conversion to text could be costly etc + many desktops would dismiss the extension box before letting you switch to another window to change the clipboard (unless it's in a VM) if manifest_v3: extension_confjs = br"""function getClip(){var area=document.createElement("textarea"); document.body.appendChild(area); area.focus();area.value='';document.execCommand("Paste");var txt=area.value; document.body.removeChild(area); return txt?txt:"Failed to read clipboard"}""" else: extension_confjs = b"" extension_confjs += b"function updateClip() {" if manifest_v3: extension_confjs += b"chrome.runtime.sendMessage({'t':getClip()},(function(cr){" else: extension_confjs += b"chrome.runtime.sendMessage(null,(function(cr){" # 'null' gets changed to getClib background-side in v2 extension_confjs += br""" var v=document.getElementById("cr"); v.textContent = ''; // clear if(cr) { try { for(const t of new DOMParser().parseFromString(' '+cr+' ','text/html').body.firstChild.childNodes) v.appendChild(t.cloneNode(true)); var a=v.getElementsByTagName('ruby'),i; for(i=0; i < a.length; i++) if(a[i].title) (function(e){e.addEventListener('click',(function(){alert(e.title)}))})(a[i]) } catch(err) { console.log(err.message) } } }))} function update() { chrome.runtime.sendMessage(false,function(r) {var i;for(i=%d;i;i++){var e=document.getElementById(""+i);if(i==-r)e.setAttribute('disabled','disabled');else e.removeAttribute('disabled')}})""" % rangeStart if rangeEnd: extension_confjs += br"""; chrome.runtime.sendMessage(true,function(r) {for(var i=0;i<%d;i++){var e=document.getElementById(""+i);if(i==r)e.setAttribute('disabled','disabled');else e.removeAttribute('disabled')}})""" % rangeEnd if known_characters: extension_confjs += br"""; chrome.runtime.sendMessage('g',function(r) {document.getElementById("kc").options.selectedIndex=r});document.getElementById("kc").addEventListener("change",function(){chrome.runtime.sendMessage(document.getElementById("kc").options.selectedIndex+.91,function(){})})""" # 'g' = report numKnownGroups, used as selection index (did have 'undefined' for this but not all Chrome versions support that apparently) extension_confjs += b';\nif(document.getElementById("cr").firstChild) updateClip()\n' extension_confjs += b"} update();\n" extension_confjs += b';'.join((b'document.getElementById("%d").addEventListener("click",function(){chrome.runtime.sendMessage(%d,update)})' % (n,n)) for n in xrange(rangeStart,rangeEnd)) extension_confjs += b';document.getElementById("c").addEventListener("click",updateClip)' dart_src = br""" /* Usage ----- If this file is saved as annotator.dart, you can import 'annotator.dart'; and then call the annotate() function.""" if dart_datafile: dart_src += br""" E.g. String result = await annotate(...); (make your function async.) Will read """+B(dart_datafile) dart_src += br""" */ import 'dart:convert';""" if zlib: dart_src += b"import 'dart:io';" dart_src += br""" class _Annotator { static const version="""+b'"'+version_stamp+br""""; int numLines = 2; // override to 1 or 3 if you must, but not recommended for learning""" if dart_datafile: dart_src+=b"\n static String data=null;" else: dart_src+=b"\n static final String data=%%DATA_INIT%%;" dart_src += br""" int addrLen=data.codeUnitAt(0),dPtr; bool needSpace; StringBuffer output; int p, copyP; List inBytes; int inputLength; String annotate(String input""" if sharp_multi: dart_src += br""",[int aType=0]""" dart_src += br""") { inBytes=utf8.encode(input); dPtr=0; inputLength=input.length; p=0; copyP=0; output = StringBuffer(); needSpace = false; while(p < inputLength) { int oldPos=p; dPtr=1;_readData(); if (oldPos==p) { needSpace=false; output.write(String.fromCharCode(inBytes[p++])); copyP++; } } return Utf8Decoder().convert(output.toString().codeUnits)""" if sharp_multi: dart_src += br""".replaceAllMapped(new RegExp("()"+"[^#]*#"*"""+annotMap("aType",True)+br"""+"(.*?)(#.*?)?"${m[1]}${m[2]}0; i--) addr=(addr << 8) | data.codeUnitAt(dPtr++); return addr; } String _readRefStr() { int a=_readAddr(); int l=data.codeUnitAt(a); String r; if (l != 0) r=data.substring(a+1,a+l+1); else r=data.substring(a+1,data.indexOf("\u0000",a+1));""" if js_utf8: dart_src += br""" return String.fromCharCodes(Utf8Encoder().convert(r));""" else: dart_src += b"return r;" dart_src += br""" } void _s() { if(needSpace) output.write(" "); else needSpace=true; // for after the word we're about to write (if no intervening bytes cause needSpace=false) } void _readData() { List sPos=List(); while(true) { int c=data.codeUnitAt(dPtr++); if ((c & 0x80)!=0) dPtr += (c&0x7F); // short jump else if (c > 107) { // short switchbyte c-=107; var i = ((p>=inputLength)?-1:data.substring(dPtr,dPtr+c).indexOf(String.fromCharCode(inBytes[p++]))); if (i==-1) i = c; if(i>0) dPtr += data.codeUnitAt(dPtr+c+i-1); dPtr += c+c; } else switch(c) { case 50: dPtr = _readAddr(); break; case 51: { int f = _readAddr(); int dO=dPtr; dPtr = f; _readData() ; dPtr = dO; break; } case 52: return; case 60: { int nBytes = data.codeUnitAt(dPtr++)+1; int i = ((p>=inputLength)?-1:data.substring(dPtr,dPtr+nBytes).indexOf(String.fromCharCode(inBytes[p++]))); if (i==-1) i = nBytes; dPtr += (nBytes + i * addrLen); dPtr = _readAddr(); break; } case 70: if(needSpace) { output.write(" "); needSpace=false; } break; case 71: case 74: { int numBytes = data.codeUnitAt(dPtr++); output.write(String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes))); copyP += numBytes; if(c==74) return; break; } case 72: case 75: { int numBytes = data.codeUnitAt(dPtr++); String annot = _readRefStr(); String base = String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)); copyP += numBytes; _s(); switch (numLines) { case 1: output.write(""); output.write(base); output.write(""); break; case 3: output.write(" "); output.write(annot); output.write(""); output.write(base); output.write(""); break; default: output.write(""); output.write(base); output.write(""); output.write(annot); output.write(""); } if(c==75) return; break; } case 73: case 76: { int numBytes = data.codeUnitAt(dPtr++); String annot = _readRefStr(); String title = _readRefStr(); String base = String.fromCharCodes(inBytes.sublist(copyP,copyP+numBytes)); copyP += numBytes; _s(); switch (numLines) { case 1: output.write(""); output.write(base); output.write(""); break; case 3: output.write(""); output.write(RegExp("[^/(;]*").matchAsPrefix(title).group(0)); output.write(""); output.write(annot); output.write(""); output.write(base); output.write(""); break; default: output.write(""); output.write(base); output.write(""); output.write(annot); output.write(""); } if(c==76) return; break; } case 80: sPos.add(p); break; case 81: p=sPos.removeLast(); break; case 90: { int tPtr = _readAddr(); int fPtr = _readAddr(); int nearbytes = data.codeUnitAt(dPtr++); int o=p; if (o > nearbytes) o -= nearbytes; else o = 0; var max = p + nearbytes; if (max > inputLength) max = inputLength; String tStr = String.fromCharCodes(inBytes.sublist(o,max)); bool found = false; while (dPtr < tPtr && dPtr < fPtr) if (tStr.indexOf(_readRefStr()) != -1) { found = true; break; } dPtr = found ? tPtr : fPtr; break; } default: throw("corrupt data table at ${dPtr-1}/${data.length} (${c})"); } } } } """ if dart_datafile: dart_src += b"Future annotate(String s,[" else: dart_src += b"String annotate(String s,[" if sharp_multi: dart_src += b"int aType=0," dart_src += b"int numLines=2]) " if dart_datafile: dart_src += b"async " dart_src += b"{ " if dart_datafile: dart_src += b"if(_Annotator.data==null) _Annotator.data=await %%DATA_INIT%%;" dart_src += b"var a=_Annotator(); a.numLines=numLines; return a.annotate(s" if sharp_multi: dart_src += b",aType" dart_src += b"); }\n" if zlib: dart_src = dart_src.replace(b"%%DATA_INIT%%",b"String.fromCharCodes(zlib.decoder.convert(%%DATA_INIT%%))") py_start = b'# Python '+version_stamp+br""" # You can import this module and call annotate(utf8 bytes) # (from multiple threads if desired), # or you can run from the command line on standard input. # annotate has an optional second argument, which can be # 'ruby' (default), 'raw' (annotation only) or 'braces'. # This module is compatible with both Python 2.7 and Python 3. """ py_end = br""" class Annotator: version="""+b'"'+version_stamp+br"""" def __call__(self,inStr,aFormat): if aFormat=="ruby": self.startA,self.midA,self.endA = b"",b"",b"" elif aFormat=="raw": self.startA=self.midA=self.endA = b"" elif aFormat=="braces": self.startA,self.midA,self.endA = b"{",b"|",b"}" else: raise Exception("Unrecognised annotation format "+repr(aFormat)) assert type(inStr)==bytes self.inStr = inStr self.addrLen = ord(data[:1]) self.inputLength = len(inStr) self.p = 0 # read-ahead pointer self.copyP = 0 # copy pointer self.output = [] self.needSpace = 0 while self.p < self.inputLength: oldPos = self.p self.dPtr = 1 ; self.readData() if oldPos == self.p: self.needSpace=0 self.output.append(inStr[self.p:self.p+1]) self.p += 1 ; self.copyP += 1 return b"".join(self.output) def readAddr(self): addr = 0 for i in range(self.addrLen): addr=(addr << 8) | ord(data[self.dPtr:self.dPtr+1]) self.dPtr += 1 return addr def readRefStr(self): a = self.readAddr(); l=ord(data[a:a+1]) if l: return data[a+1:a+l+1] else: return data[a+1:data.index(b'\x00',a+1)] def s(self): if self.needSpace: self.output.append(b" ") else: self.needSpace=1 def readData(self): sPos = [] ; out = self.output while True: d = ord(data[self.dPtr:self.dPtr+1]) ; self.dPtr += 1 if d==50: self.dPtr = self.readAddr() elif d==51: func = self.readAddr() ; dO = self.dPtr self.dPtr = func ; self.readData() ; self.dPtr = dO elif d==52: return elif d==60 or d<20: if d<20: nBytes=d+1 else: nBytes = ord(data[self.dPtr:self.dPtr+1])+1 self.dPtr += 1 if self.p>=len(self.inStr): i = -1 else: i = data[self.dPtr:self.dPtr+nBytes].find(self.inStr[self.p:self.p+1]) ; self.p += 1 if i==-1: i = nBytes if d<20: if i>0: self.dPtr += ord(data[self.dPtr+nBytes+i-1:self.dPtr+nBytes+i]) self.dPtr += nBytes * 2 else: self.dPtr += (nBytes + i * self.addrLen) self.dPtr = self.readAddr() elif d==70: if self.needSpace: out.append(b' ') ; self.needSpace=0 elif d==71 or d==74: numBytes = ord(data[self.dPtr:self.dPtr+1]) self.dPtr += 1 out.append(self.inStr[self.copyP:self.copyP+numBytes]) self.copyP += numBytes if d==74: return elif d==72 or d==75: numBytes = ord(data[self.dPtr:self.dPtr+1]) self.dPtr += 1 annot = self.readRefStr() self.s() if self.startA: out.append(self.startA) out.append(self.inStr[self.copyP:self.copyP+numBytes]) self.copyP += numBytes out.append(self.midA) ; out.append(annot) out.append(self.endA) if d==75: return elif d==73 or d==76: numBytes = ord(data[self.dPtr:self.dPtr+1]) self.dPtr += 1 annot = self.readRefStr() title = self.readRefStr() self.s() if self.startA==b"{": # omit title in braces mode out.append(self.startA) out.append(self.inStr[self.copyP:self.copyP+numBytes]) elif self.startA: out.append(b""); out.append(self.inStr[self.copyP:self.copyP+numBytes]) self.copyP += numBytes out.append(self.midA) ; out.append(annot) out.append(self.endA) if d==76: return elif d==80: sPos.append(self.p) elif d==81: self.p = sPos.pop() elif d==90: tPtr = self.readAddr() fPtr = self.readAddr() nearbytes = ord(data[self.dPtr:self.dPtr+1]) self.dPtr += 1 o = max(self.p-nearbytes,0) maxx = min(self.p+nearbytes,self.inputLength) tStr = self.inStr[o:maxx] found = False while self.dPtr < tPtr and self.dPtr < fPtr: if self.readRefStr() in tStr: found = True ; break if found: self.dPtr = tPtr else: self.dPtr = fPtr elif d>0x80: self.dPtr += d-0x80 else: raise Exception("corrupt data table at "+str(self.dPtr-1)+" ("+str(ord(data[self.dPtr-1:self.dPtr]))+")") def annotate(inStr,p="ruby"): return Annotator()(inStr,p) def main(): import sys ; aFormat = 'ruby' for a in sys.argv[1:]: if a.startswith("--"): aFormat=a[2:] if type("")==type(u""): sys.stdout.buffer.write(annotate(sys.stdin.buffer.read(),aFormat)) # Python 3 else: sys.stdout.write(annotate(sys.stdin.read(),aFormat)) # Python 2 if __name__=="__main__": main() """ # TODO: annotation-type option from command line in py c_zlib = br"""static unsigned char *data=NULL; static void init() { z_stream s; memset(&s,0,sizeof(s)); s.next_in=origData; s.avail_in=%%ZLIBLEN%%; data=malloc(%%ORIGLEN%%); // TODO: check non-NULL s.next_out=data; s.avail_out=%%ORIGLEN%%; inflateInit(&s); inflate(&s, Z_NO_FLUSH); // TODO: check for memory and data-corruption errors inflateEnd(&s); } """ c_datadrive = br""" static unsigned char *dPtr; static int addrLen; #include static unsigned char * readAddr() { size_t i,addr=0; for (i=addrLen; i; i--) addr=(addr << 8) | *dPtr++; return data + addr; } static void readData() { POSTYPE *savedPositions = NULL; size_t numSavedPositions = 0; while(1) { unsigned char c = *dPtr++; if (c & 0x80) dPtr += (c&0x7F); // short relative forward jump (up to 128 bytes from addr after instruction) else if(c < 20) { // switchbyte with short jumps c++; // now c == nBytes unsigned char byte=(unsigned char)NEXTBYTE; int i; for (i=0; i 1000000 and warnPhrases: sys.stderr.write("WARNING: Your corpus needs more phrase delimiters!\nVery long phrases can take a LONG time to process.\n") warnPhrases = False yield y markupPattern = re.compile(re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),flags=re.DOTALL) wordPattern = re.escape(markupStart)+'.*?'+re.escape(markupEnd) if suffix: suffix = re.compile('(?:'+'|'.join('(?:'+re.escape(i)+')' for i in T(suffix).split(','))+')(?='+re.escape(markupEnd)+r'|\s)') multiWordPattern = re.escape(markupEnd)+".*?"+re.escape(markupStart) # indicates there could be more than one word phrasePattern = re.compile(wordPattern+r'(\s*'+wordPattern+r')*',flags=re.DOTALL+re.UNICODE) wordPattern = re.compile(wordPattern,flags=re.DOTALL) wspPattern = re.compile(r"\s+",flags=re.UNICODE) def annotationOnly(text): ret = [] for w in re.finditer(markupPattern,text): if mreverse: ret.append(w.group(1)) else: ret.append(w.group(2)) return ' '.join(ret) def markDown(text): # Return just the original text, without markup if mreverse: group=r"\2" else: group=r"\1" return re.sub(markupPattern,group,text) def markUp(text,annotation): if mreverse: text,annotation = annotation,text return markupStart + text + markupMid + annotation + markupEnd def status_update(phraseNo,numPhrases,wordsThisPhrase,nRules,phraseLastUpdate,lastUpdate,startTime,coverP,nRej): phraseSec = (phraseNo-phraseLastUpdate)*1.0/(time.time()-lastUpdate) if phraseSec < 100: phraseSecS = "%.1f" % phraseSec else: phraseSecS = "%d" % int(phraseSec) progress = status_prefix + "%s phrase/sec (%d%%/#w=%d) rules=%d cover=%d%%" % (phraseSecS,int(100.0*phraseNo/numPhrases),wordsThisPhrase,nRules,coverP) if warn_yarowsky: progress += (" rej=%d" % nRej) if time_estimate: if phraseNo < 10: phraseMin = phraseSec*60 # current 'instantaneous' speed else: phraseMin = phraseNo*60/(time.time()-startTime) # longer-term average minsLeft = (numPhrases-phraseNo)/phraseMin if minsLeft>60*24: progress += " %dd+" % int(minsLeft/60/24) elif minsLeft>60: progress += " %dh+" % int(minsLeft/60) elif minsLeft: progress += " %dmin+" % minsLeft # (including the + because this is liable to be an underestimate; see comment after the --time-estimate option) if len(progress) + 14 < screenWidth: progress += " (at %02d:%02d:%02d" % time.localtime()[3:6] # clock time: might be useful for checking if it seems stuck if len(progress) + 20 < screenWidth and not clear_eol == " ": # (being able to fit this in can be intermittent) elapsed = time.time() - startTime progress += ", analyse=%d:%02d:%02d" % (elapsed/3600,(elapsed%3600)/60,elapsed%60) progress += ")" sys.stderr.write(progress+clear_eol+"\r") sys.stderr.flush() def read_and_normalise(): global infile, corpus_unistr if infile: infile=openfile(infile) else: infile = sys.stdin if isatty(infile): sys.stderr.write("Reading from standard input\n(If that's not what you wanted, press Ctrl-C and run again with --help)\n") corpus_unistr = getBuf(infile).read().decode(incode) if diagnose and not diagnose in corpus_unistr: diagnose_write(diagnose+" is not present in the corpus, even before normalisation") suppress = True else: suppress = False loaded_from_cache = normalise() # will change corpus_unistr if diagnose and not suppress and not diagnose in corpus_unistr: diagnose_write(diagnose+" was in the corpus before normalisation, but not after") if loaded_from_cache: diagnose_write("You might want to remove "+normalise_cache+' and redo the diagnose') collapsed_separators = ['',"'",u"\u2019"] # TODO: customise def addHyphenReplacements(hTry,w): for r in collapsed_separators: hTry.add(w.replace('-',r)) def normWord(w,allWords): hTry,typo = set(),None if '-' in w: addHyphenReplacements(hTry,w) # if not annot_whitespace, we'll replace any non-hyphenated 'run together' version by the version with the hyphen; that's often the sensible thing to do with pinyin etc (TODO more customisation??) md = markDown(w) if suffix and len(md)>=suffix_minlen: wN = re.sub(suffix,'',w) if not w==wN: hTry.add(wN) # normalise on having the suffix in if not capitalisation: wl = w.lower() # (as long as it's all Unicode strings, .lower() and .upper() work with accents etc) if not w==wl and wl in allWords: # This word is NOT always capitalised. # Could be 'caps if at start of sentence' # (or title-case etc), but might also be # a corpus error, so check numbers. if allWords[wl]*5 < allWords[w] and allWords[wl] <= normalise_debug: typo = (wl,(u"%s (%d instances) overrides %s (%d instances)" % (wl,allWords[wl],w,allWords[w]))) # To simplify rules, make it always lower. w = wl if '-' in w: addHyphenReplacements(hTry,w) wN = re.sub(suffix,'',w) if not w==wN: hTry.add(wN) if annot_whitespace or (keep_whitespace and markDown(w) in keep_whitespace): return w,None,typo r = trySplit(wspPattern,w,md) if r: return r,hTry,typo elif r==False: # no space found in w r = trySplit("-",w,md) # hTry will normalise to putting the hyphen in if there's a without-hyphen version, but if there's a version that splits at the hyphen into separate words, we normalise to that instead as if the hyphen were a space (TODO: optionally?) if r: return r,hTry,typo return w,hTry,typo def trySplit(splitPattern,w,md): if not re.search(splitPattern,w): return False if not splitPattern=="-": # (don't try runTogether on hyphens: that's hTry, as we want to normalise it to keeping the hyphen) for r in collapsed_separators: runTogether = re.sub(splitPattern,r,w) if not capitalisation and not runTogether.lower()==runTogether and runTogether.lower() in allWords: return runTogether.lower() if runTogether in allWords: return runTogether # varying whitespace in the annotation of a SINGLE word: probably simplest if we say the version without whitespace, if it exists, is 'canonical' (there might be more than one with-whitespace variant), at least until we can set relative normalisation authority (TODO) # TODO: do we check for annot[0]+annot[1:].lower() version too ao = annotationOnly(w) if splitPattern=="-": annotList = ao.split("-") else: annotList = ao.split() if len(md.split())==1 and len(annotList) <= len(md): # Try different ways of # assigning each word to chars, and see if any # of these exist in the corpus; if any does, # assume we have "ABC|a bc" <= "A|a BC|bc" type # situations - the latter shouldn't necessarily be # converted into the former, but the former might # be convertible into the latter to simplify rules if capitalisation: annotListLower = annotList else: annotListLower = [w0.lower() for w0 in annotList] for charBunches in different_ways_of_splitting(md,len(annotList)): mwLowerList = [markUp(c,w0) for c,w0 in zip(charBunches,annotListLower)] if "".join(mwLowerList) in cu_lower_nospaces: if not capitalisation: for i in range(len(annotList)): wu = markUp(charBunches[i],annotList[i]) wl = mwLowerList[i] if not wu==wl and not wl in allWords: mwLowerList[i] = wu # restore original caps return "".join(mwLowerList) # TODO: is there ANY time where we want multiword to take priority over the runTogether version above? or even REPLACE multiword occurrences in the corpus with the runTogether version?? (must be VERY CAREFUL doing that) def normBatch(words): r,typoR = [],[] for w in words: w2,hTry,typo = normWord(w,allWords) if hTry: hTry.add(w2.replace('-','')) # in case not already there for h in hTry: if not h==w2 and h in allWords: r.append((h,w2)) if not w==w2: r.append((w,w2)) if typo: typoR.append(typo) return r,typoR def normalise(): global capitalisation # might want to temp change it global corpus_unistr,allWords,cu_lower_nospaces if normalise_cache: try: corpus_unistr = openfile(normalise_cache).read().decode('utf-8') sys.stderr.write("Normalised copy loaded\n") return True # loaded from cache except: pass if (capitalisation and annot_whitespace) or priority_list: return # TODO: might want to normalise at least the word breaks if priority_list (but it loads it anyway if cached) sys.stderr.write("Normalising...");sys.stderr.flush() old_caps = capitalisation if priority_list: capitalisation = True # no point keeping it at False allWords = getAllWords() if removeSpace: corpus_unistr = re.sub(re.escape(markupEnd)+r'\s+'+re.escape(markupStart),(markupEnd+markupStart).replace('\\',r'\\'),corpus_unistr,flags=re.UNICODE) # so getOkStarts works consistently if corpus has some space-separated and some not corpus_unistr = re.sub(re.escape(markupStart)+r'\s+',markupStart.replace('\\',r'\\'),re.sub(r'\s+'+re.escape(markupMid),markupMid.replace('\\',r'\\'),re.sub(re.escape(markupMid)+r'\s+',markupMid.replace('\\',r'\\'),re.sub(r'\s+'+re.escape(markupEnd),markupEnd.replace('\\',r'\\'),corpus_unistr,flags=re.UNICODE),flags=re.UNICODE),flags=re.UNICODE),flags=re.UNICODE) # so we're tolerant of spurious whitespace between delimeters and markup (TODO: do this even if not removeSpace?) if not annot_whitespace: # normalise trailing hyphens e.g. from OCR'd scans: cu0 = corpus_unistr ; ff = 0 for hTry in [1,2]: for w in allWords.keys(): if '-'+aoEnd in w: idx = w.index('-'+aoEnd) if w[:idx].endswith(aoStart) or w[:idx].endswith("-"): continue # ignore this one (a mess of some kind) if hTry==2: # ouch, this doesn't look good getBuf(sys.stderr).write((u" (can't normalise hyphens due to '%s') " % w).encode(terminal_charset,'replace')) ; sys.stderr.flush() corpus_unistr = cu0 ; break if mreverse: grp,mdG=r"-\1",r"\2" else: grp,mdG=r"-\2",r"\1" # TODO: batch up the following replacements by using something similar to Replacer but with a common destination regexp that takes groups from the 'w' entries as well. (Low priority because don't typically get TOO many of these dangling hyphens in most corpuses.) corpus_unistr = re.sub(re.escape(w)+r"\s*"+re.escape(markupStart)+"(.*?)"+re.escape(markupMid)+"(.*?)"+re.escape(markupEnd),w.replace('\\',r'\\').replace('-'+aoEnd.replace('\\',r'\\'),grp+aoEnd.replace('\\',r'\\')).replace(mdEnd.replace('\\',r'\\'),mdG+mdEnd.replace('\\',r'\\')),corpus_unistr,flags=re.DOTALL+re.UNICODE) ff = 1 if ff: allWords = getAllWords() # re-generate del cu0 cu_lower_nospaces = re.sub(wspPattern,"",corpus_unistr) # doesn't matter that spaces inside annotation are also taken out by this, since it's used only for searching for words that don't have spaces in their annotation in the split logic if not capitalisation: cu_lower_nospaces = cu_lower_nospaces.lower() sys.stderr.write(":") ; sys.stderr.flush() tmp = corpus_unistr ; del corpus_unistr numCores = setup_parallelism() corpus_unistr = tmp perCore = int(len(allWords)/numCores)+1 allWL = list(allWords.keys()) ; jobs = [] for c in xrange(numCores-1): jobs.append(executor.submit(normBatch,allWL[c*perCore:(c+1)*perCore])) if numCores>1: allWL=allWL[(numCores-1)*perCore:] results = [normBatch(allWL)] del allWords,cu_lower_nospaces for j in jobs: results.append(j.result()) if cores_command: os.system(cores_command+" 1") sys.stderr.write(".") ; sys.stderr.flush() dic = {} for SR,typos in results: for x,y in SR: if diagnose and diagnose in x: diagnose_write("Changing %s to %s" % (x,y)) dic[x] = y for wl,msg in typos: typo_report("normalise-debug.txt","allow-caps-exceptions.txt",wl,msg) for k in list(dic.keys()): seen = set() while dic[k] in dic: v = dic[dic[k]] assert not v in seen, "normalisation loop!" if v in seen: break seen.add(v) dic[k] = v sys.stderr.write(":") ; sys.stderr.flush() for exp in orRegexes(re.escape(k) for k in iterkeys(dic)): corpus_unistr = re.sub(exp,lambda k:dic[k.group(0)],corpus_unistr) sys.stderr.write(" done\n") if normalise_cache and capitalisation==old_caps: openfile(normalise_cache,'w').write(corpus_unistr.encode('utf-8')) capitalisation = old_caps def getAllWords(): allWords = {} for phrase in splitWords(corpus_unistr,phrases=True): for w in splitWords(phrase): allWords[w]=allWords.setdefault(w,0)+1 return allWords # do NOT cache (call = regenerate) def orRegexes(escaped_keys): escaped_keys = list(escaped_keys) # don't just iterate if not escaped_keys: return # don't yield "" try: yield re.compile('|'.join(escaped_keys)) except OverflowError: # regex too big (e.g. default Python on Mac OS 10.7 i.e. Python 2.7.1 (r271:86832, Jul 31 2011, 19:30:53); probably some Windows versions also; does not affect Mac HomeBrew's Python 2.7.12) ek = escaped_keys[:int(len(escaped_keys)/2)] for r in orRegexes(ek): yield r ek = escaped_keys[len(ek):] for r in orRegexes(ek): yield r def PairPriorities(markedDown_Phrases,existingPris={}): markedDown_Phrases = list(markedDown_Phrases) assert all(type(p)==list for p in markedDown_Phrases) mdwSet = set(existingPris.keys()) for p in markedDown_Phrases: mdwSet.update(p) assert all(type(w)==unicode for w in mdwSet) votes = {} ; lastT = time.time() for pi in xrange(len(markedDown_Phrases)): if time.time() > lastT+2: sys.stderr.write("PairPriorities: %d%%%s\r" % (pi*100/len(markedDown_Phrases),clear_eol)) ; sys.stderr.flush() lastT = time.time() p=markedDown_Phrases[pi] for x in xrange(len(p)-1): a,b = p[x:x+2] combined = a+b for i in xrange(1,len(combined)): if i==len(a): continue elif ib, then a>c for any b>c (c=c but we don't have equality), # and c>b for any c>a. candidate = set([(a,b)]+[(a,c) for c in lessThan.get(b,[])]+[(c,b) for c in gtThan.get(a,[])]) if closure==None: # no longer tracking closure if any(y in gtThan.get(x,{}) for (x,y) in candidate): return # contradiction else: if any((y,x) in closure for (x,y) in candidate): return # contradiction, use higher abs votes closure.update(candidate) for x,y in candidate: # x>y y _cmpT + 2: sys.stderr.write("Finalising: %d/%d%s\r" % (len(r),len(mdwList),clear_eol)) ; sys.stderr.flush() _cmpT=time.time() _cmpW=True if w in tcA: if w==diagnose: f0 = existingPris.get(w,0) found = False for i in xrange(len(r)): W,f = r[i] if (w,W) in trueClosure: found = True if 1+f > f0: diagnose_write("Increasing f(%s) from %d to %d to outweigh %s (f=%d)" % (w,f0,1+f,W,f)) f0 = 1+f else: diagnose_write("f(%s)=%d already outweighs %d for %s" % (w,f0,f,W)) elif (W,w) in trueClosure: found = True diagnose_write("Problem? %s (f=%d) before %s (f=%d)" % (W,f,w,f0)) if not found: diagnose_write("No interactions with %s found among %d lower-priority words" % (w,len(r))) l = [f0-1] else: l = [r[i][1] for i in xrange(len(r)) if (w,r[i][0]) in trueClosure] else: l = [] r.append((w,1+max([existingPris.get(w,0)-1]+l))) if _cmpW: sys.stderr.write("Finalising: done%s\n" % clear_eol) return sorted(r) def skipToNext(thing): return "(?:(?!"+re.escape(thing)+").)*"+re.escape(thing) # not ".*?"+re.escape(thing) as it may absorb one 'thing' to match the rest of the regex later if mreverse: mdStart,mdEnd,mdSplitR,aoStart,aoEnd = markupMid,markupEnd,re.escape(markupEnd)+r'\s*'+re.escape(markupStart)+skipToNext(markupMid),markupStart,markupMid else: mdStart,mdEnd,mdSplitR,aoStart,aoEnd = markupStart,markupMid,re.escape(markupMid)+skipToNext(markupEnd)+r'\s*'+re.escape(markupStart),markupMid,markupEnd mdSplitR="(?:"+mdSplitR+")?" # so can use it in .join(chars) to say "maybe word-break between these chars" def different_ways_of_splitting(chars,numWords): if numWords > len(chars): return elif numWords == len(chars): yield list(chars) ; return elif numWords == 1: yield [chars] ; return spAt_try1 = int(len(chars) / numWords) + 1 for spAt in list(range(spAt_try1,0,-1)) + list(range(spAt_try1+1, len(chars)-numWords+1)): for r in different_ways_of_splitting(chars[spAt:],numWords-1): yield [chars[:spAt]]+r if type(u"")==type(""): # Python 3 getNext = lambda gen: gen.__next__() iterkeys = lambda d: d.keys() itervalues = lambda d: d.values() iteritems = lambda d: d.items() else: # Python 2 getNext = lambda gen: gen.next() iterkeys = lambda d: d.iterkeys() itervalues = lambda d: d.itervalues() iteritems = lambda d: d.iteritems() def yarowsky_indicators(withAnnot_unistr,canBackground): # yields True if rule always works (or in majority of cases with ymajority), or lists enough indicators to cover example instances and yields (negate, list, nbytes), or just list if empty. # (If too few indicators can be found, will list the ones it can, or empty if no clearly-distinguishable indicators can be found within ybytes of end of match.) # yield "backgrounded" = task has been backgrounded; getNext collects result nonAnnot=markDown(withAnnot_unistr) def unconditional_looks_ok(explain): # could we have this as an unconditional rule, with the other cases as exceptions that will be found first? (NB this is not the same thing as a 'default-yes rule with exceptions', this is a rule with NO qualifying indicators either way) if len(nonAnnot)==1: if nonAnnot==diagnose: diagnose_write("%s is default by %s len=1 rule after removing irrelevant badStarts" % (withAnnot_unistr,explain)) return True # should be safe, and should cover most "common short Chinese word with thousands of contexts" cases # If len 2 or more, it's risky because the correct solution could be to process just a fraction of the word now and the rest will become the start of a longer word, so we probably don't want it matching the whole lot by default: we'll want positive or negative indicators instead. # e.g. looking at rule AB, text ABC and correct segmentation is A BC, don't want it to 'greedily' match AB by default without indicators it should do so # Check for no "A BC" situations, i.e. can't find any possible SEQUENCE of rules that STARTS with ALL the characters in nonAnnot and that involves having them SPLIT across multiple words: # (The below might under-match if there's the appearance of a split rule but it actually has extra non-marked-up text in between, but it shouldn't over-match.) # TODO: if we can find the actual "A BC" sequences (instead of simply checking for their possibility as here), and if we can guarantee to make 'phrase'-length rules for all of them, then AB can still be the default. This might be useful if okStarts is very much greater than badStarts. It would require checkCoverage to mark "A" as False if there exists a (no-indicators) "AB" rule. # (TODO: until the above is implemented, consider recommending --ymax-threshold=0 so all ybytes ranges are tried, because, now that Yarowsky-like collocations can be negative, the 'following word' could just go in as a collocation with low ybytes) # TODO: also, if the exceptions to rule AB are always of the form "Z A B", and we can guarantee to generate a phrase rule for "Z A B", then AB can still be default. (We should already catch this when the exceptions are "ZA B", but not when they are "Z A B", and --ymax-threshold=0 probably won't always help here, especially if Z==B; Mandarin "mei2you3" / "you3 mei2 you3" comes to mind) llen = len(mdStart)+len(nonAnnot) regex=re.compile(re.escape(mdStart) + mdSplitR.join(re.escape(c) for c in list(nonAnnot))) if all(x.end()-x.start()==llen for x in re.finditer(regex,corpus_unistr)): if nonAnnot==diagnose: diagnose_write("%s is default by %s rule after checking for dangerous overlaps etc" % (withAnnot_unistr,explain)) return True if nonAnnot==diagnose: diagnose_write("%s cannot be default by %s due to %s" % (withAnnot_unistr,explain,', '.join(list(set(["'"+x.group()+"'" for x in re.finditer(regex,corpus_unistr) if not x.end()-x.start()==llen]))[:5]))) if nonAnnot in yPriorityDic: # TODO: enforce len==1 ? if yPriorityDic[nonAnnot] == withAnnot_unistr: # we want this case to be the default if len(withAnnot_unistr)==1: if nonAnnot==diagnose: diagnose_write("ref-pri forces %s" % (withAnnot_unistr,)) yield True ; return else: if nonAnnot==diagnose: diagnose_write("ref-pri wants %s by default: finding negative indicators only" % (withAnnot_unistr,)) can_be_default = "must" # might not even need to get okStarts, etc if unconditional_looks_ok("ref-pri"): yield True ; return else: if nonAnnot==diagnose: diagnose_write("ref-pri forbids default %s" % (withAnnot_unistr,)) can_be_default = False # another is default, don't make this one default even if it occurs more else: can_be_default = True # First, find positions in corpus_markedDown which match withAnnot_unistr in corpus_unistr okStarts = getOkStarts(withAnnot_unistr) # now check for corpus_markedDown matches that *don't* have withAnnot_unistr badStarts = getBadStarts(nonAnnot,okStarts) if not badStarts: if nonAnnot==diagnose: diagnose_write("%s has no badStarts" % (withAnnot_unistr,)) yield True ; return # rule always works, no Yarowsky-like indicators needed if can_be_default and len(okStarts) > len(badStarts) and len(nonAnnot)==1: if nonAnnot==diagnose: diagnose_write("%s is default by majority-case len=1 rule" % (withAnnot_unistr,)) yield True ; return # duplicate of code below (can test for this case early before reducing-down badStarts) badStarts = getReallyBadStarts(badStarts,nonAnnot) # see its comments (ignore some badStarts) if not badStarts: if nonAnnot==diagnose: diagnose_write("%s has only probably-irrelevant badStarts" % (withAnnot_unistr,)) yield True ; return # Now, if it's right more often than not: if can_be_default==True and len(okStarts) > len(badStarts) and unconditional_looks_ok("majority-case"): # (if can_be_default=="must", we have already checked for unconditional_looks_ok() above before computing okStarts and badStarts) yield True ; return run_in_background = canBackground and len(okStarts) > 500 and executor # In a test with 300, 500, 700 and 900, the 500 threshold was fastest on concurrent.futures, but by just a few seconds. may_take_time = canBackground and len(okStarts) > 1000 if may_take_time: getBuf(sys.stderr).write((u"\nLarge collocation check (%s has %d matches + %s), %s.... \n" % (withAnnot_unistr,len(okStarts),badInfo(badStarts,nonAnnot),"backgrounding" if run_in_background else "could take some time")).encode(terminal_charset,'replace')) if len(badStarts) <= yarowsky_debug: typo_report("yarowsky-debug.txt","allow-exceptions.txt",withAnnot_unistr,(u"%s has %d matches + %s" % (withAnnot_unistr,len(okStarts),badInfo(badStarts,nonAnnot,False)))) if run_in_background: job = executor.submit(yarowsky_indicators_wrapped,withAnnot_unistr) # recalculate the above on the other CPU in preference to passing, as memory might not be shared yield "backgrounded" ; yield job yield job.result() ; return if ybytes_max > ybytes and (not ymax_threshold or len(nonAnnot) <= ymax_threshold): retList = [] ; append=retList.append times = [] for nbytes in range(ybytes,ybytes_max+1,ybytes_step): t = time.time() negate,ret,covered,toCover,nbytes = tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr,can_be_default=="must",nbytes==ybytes_max) if covered==toCover and len(ret)==1: if may_take_time: sys.stderr.write(" - using 1 indicator, negate=%s\n" % repr(negate)) yield (negate,ret,nbytes) ; return # a single indicator that covers everything will be better than anything else we'll find append((-int(covered*100/toCover),len(ret),nbytes,negate,toCover,ret)) # (1st 4 of these are the sort keys: maximum coverage to nearest 1%, THEN minimum num indicators for the same coverage, THEN minimum nbytes (TODO: problems of very large nbytes might outweigh having more indicators; break if found 100% coverage by N?), THEN avoid negate) # TODO: try finding an OR-combination of indicators at *different* proximity lengths ? if nbytes>ybytes and ymax_limitwords and nonAnnot in ymax_limitwords: break times.append(time.time()-t) ; t=time.time() if len(times)>2 and sum(times) > 20*60 and not min(i[0] for i in retList)<1.05*min(i[0] for i in retList[:2]): diagnose_write("%s took %d+mins, consider --ymax-limitwords (mins:%s, coverage:%s, indicators:%s, noexpand discards #2+)" % (withAnnot_unistr,sum(times)/60,",".join(str(int(t/60)) for t in times),",".join((str(-i[0])+"%") for i in retList),",".join(str(i[1]) for i in retList)),"Suggestion") ret0 = min(retList,key=lambda x:x[:3]+(str(x[3]),)) # (don't let Python3 try to compare True with 'harder' in x[3] like Python2 does) if nonAnnot==diagnose: diagnose_write("Best coverage is %d%% of %d" % (-ret0[0],ret0[-2])) negate,ret = ret0[-3],ret0[-1] distance = ret0[2] else: negate,ret = tryNBytes(ybytes_max,nonAnnot,badStarts,okStarts,withAnnot_unistr,can_be_default=="must")[:2] if ybytes < ybytes_max: distance = ybytes_max else: distance = None # all the same anyway if not ret and warn_yarowsky: getBuf(sys.stderr).write((u"Couldn't find ANY Yarowsky-like indicators for %s \n" % withAnnot_unistr).encode(terminal_charset,'replace')) # (if nonAnnot==diagnose, this'll be reported by tryNBytes below) # TODO: if partially but not completely covered, shouldn't entirely count the word as 'covered' in analyse() elif ret and may_take_time: sys.stderr.write(" - using %d indicators, negate=%s\n" % (len(ret),repr(negate))) if not ret or (not distance and not negate): yield ret else: if not distance: distance = ybytes_max yield negate,ret,distance typo_data = {} def typo_report(debugFile,exceptionFile,withAnnot_unistr,msg_unistr): if not exceptionFile in typo_data: try: typo_data[exceptionFile]=set(splitWords(openfile(exceptionFile).read().decode(terminal_charset))) except IOError: typo_data[exceptionFile]=set() if withAnnot_unistr not in typo_data[exceptionFile]: if not debugFile in typo_data: typo_data[debugFile] = openfile(debugFile,'w') getBuf(sys.stderr).write(B(bold_on+"Writing to "+debugFile+bold_off+"\n")) getBuf(typo_data[debugFile]).write(B("Put any of the following first-of-line words into %s to avoid being alerted here next time.\n\n" % exceptionFile)) getBuf(typo_data[debugFile]).write((msg_unistr+u"\n").encode(terminal_charset,'replace')) typo_data[debugFile].flush() # in case interrupted def yarowsky_indicators_wrapped(withAnnot_unistr): return getNext(yarowsky_indicators(withAnnot_unistr,False)) def getOkStarts(withAnnot_unistr): if withAnnot_unistr in precalc_sets: return precalc_sets[withAnnot_unistr] walen = len(withAnnot_unistr) return set(x for x in precalc_sets[getNext(splitWords(withAnnot_unistr))] if corpus_unistr[m2c_map[x]:m2c_map[x]+walen]==withAnnot_unistr) def getBadStarts(nonAnnot,okStarts): r = [] ; append=r.append l=len(nonAnnot) k = nonAnnot[:2] if k in bigramCache: for i in bigramCache[k]: if not i in okStarts and corpus_markedDown[i:i+l]==nonAnnot: append(i) return r find = corpus_markedDown.find i = find(nonAnnot) while i != -1: if not i in okStarts: append(i) i = find(nonAnnot,i+l) return r def getReallyBadStarts(badStarts,nonAnnot): # Some of the badStarts can be ignored on the grounds that they should be picked up by other rules first: any where the nonAnnot match does not start at the start of a word (the rule matching the word starting earlier should get there first), and any where it starts at the start of a word that is longer than its own first word (the longest-first ordering should take care of this). So keep only the ones where it starts at the start of a word and that word is no longer than len(nonAnnot). reallyBadStarts = [] ; append=reallyBadStarts.append nonAnnotLen = len(mdStart+nonAnnot+mdEnd) theRe = re.compile(re.escape(mdStart+nonAnnot[0])+".*?"+re.escape(mdEnd),flags=re.DOTALL) for b in badStarts: try: s = m2c_map[b] except KeyError: continue # it wasn't the start of a word (only start positions are in that map) m=theRe.search(corpus_unistr, s) # will either start at s, or after it if mreverse assert m, "m2c_map error? "+repr(nonAnnot[0])+" "+repr(b)+"->"+repr(s)+" not found ( "+repr(corpus_markedDown[b:b+25])+"... -> "+repr(corpus_unistr[s:s+50])+"...)" s,e = m.start(),m.end() if e-s > nonAnnotLen: continue # this word is too long, should be matched by a longer rule 1st append(b) # to reallyBadStarts return reallyBadStarts def tryNBytes(nbytes,nonAnnot,badStarts,okStarts,withAnnot_unistr,force_negate,try_harder=True): # try to find either positive or negative Yarowsky-like indicators, whichever gives a smaller set (or only negative ones if force_negate, used by end_pri yPriorityDic logic). Negative indicators might be useful if there are many matches and only a few special exceptions. (If not force_negate, then negative indicators are used only if they cover 100% of the exceptions; see below re negate==None) def bytesAround(start): return within_Nbytes(start+len(nonAnnot),nbytes) okStrs=list(set(bytesAround(s) for s in okStarts)) badStrs=list(set(bytesAround(s) for s in badStarts)) if nonAnnot==diagnose: inBoth = set(okStrs).intersection(set(badStrs)) if inBoth: diagnose_write("tryNBytes(%d) on %s has contexts that are both OK and bad: %s" % (nbytes,withAnnot_unistr,"/".join(list(inBoth)[:10]))) pOmit = unichr(1).join(badStrs) # omit anything that occurs in this string from +ve indicators nOmit = unichr(1).join(okStrs) # ditto for -ve indicators avoidSelf = unichr(1)+nonAnnot[max(0,len(nonAnnot)-nbytes):].encode(outcode)[-nbytes:].decode(outcode,'ignore') pOmit += avoidSelf ; nOmit += avoidSelf pCovered=[False]*len(okStrs) nCovered=[False]*len(badStrs) n2Covered=[False]*len(badStrs) pRet = [] ; pAppend=pRet.append nRet = [] ; nAppend=nRet.append n2Ret = [] ; nAppend2 = n2Ret.append negate = None # not yet set toCheck = [] ; diagnostics = [] if not force_negate: didFind = [] # for append(True) when something found, used only by diagnostics diagnostics.append((didFind,"",pRet,pCovered)) toCheck.append((didFind,okStrs,pAppend,pCovered,unique_substrings(okStrs,lambda txt:txt in pOmit,lambda txt:sum(1 for s in okStrs if txt in s)))) # a generator and associated parameters for positive indicators diagnose_extra = [] if force_negate or 5*len(okStrs) > len(badStrs) or not okStrs: # and for negative indicators, if appropriate: (changed in v0.6892: still check for negative indicators if len(okStrs) is similar to len(badStrs) even if not strictly greater, but don't bother if len(okStrs) is MUCH less) didFind = [] diagnostics.append((didFind,"negative",nRet,nCovered)) toCheck.append((didFind,badStrs,nAppend,nCovered,unique_substrings(badStrs,lambda txt:txt in nOmit,lambda txt:sum(1 for s in badStrs if txt in s)))) if try_harder and okStrs and not force_negate: didFind = [] diagnostics.append((didFind,"overmatch-negative",n2Ret,n2Covered)) toCheck.append((didFind,badStrs,nAppend2,n2Covered,unique_substrings(badStrs,lambda txt:txt in avoidSelf,lambda txt:(sum(1 for s in badStrs if txt in s),-sum(1 for s in okStrs if txt in s))))) # a harder try to find negative indicators (added in v0.6896): allow over-matching (equivalent to under-matching positive indicators) if it's the only way to get all badStrs covered (v3.264: only don't try creating an indicator from the word itself which would render the rule a no-op). May be useful if the word can occur in isolation. elif nonAnnot==diagnose: diagnose_extra.append("Not checking for negative indicators as 5*%d>%d=%s." % (len(okStrs),len(badStrs),repr(5*len(okStrs)>len(badStrs)))) while toCheck and negate==None: for i in range(len(toCheck)): didFind,strs,append,covered,generator = toCheck[i] try: indicator = getNext(generator) # gets a yield from the corresponding unique_substrings call except StopIteration: del toCheck[i] ; break found = True ; cChanged = False for j in xrange(len(strs)): if not covered[j] and indicator in strs[j]: covered[j]=cChanged=True if cChanged: append(indicator) if not didFind: didFind.append(True) if all(covered): if append==pAppend: negate=False elif append==nAppend: negate=True # negate with no overmatch allowed found all the exceptions, so use it (don't use it if it doesn't find ALL the exceptions, since we don't ever want an as-if 'overmatch positive' i.e. misidentifying a word/phrase in a place where the corpus explicitly DOESN'T have it, unless force_negate see comment below) else: # append==nAppend2 (negate with overmatch allowed): we managed to get all exceptions with overmatch-negative, but how much damage did our overmatching do to the NON-exceptions? fxCover = [True]*len(okStrs) for indicator in n2Ret: for i in xrange(len(okStrs)): if fxCover[i] and indicator in okStrs[i]: # a negative indicator 'misfires' here, resulting in this okStr NOT being identified as 'ok' fxCover[i] = False if sum(1 for x in fxCover if x) >= sum(1 for x in pCovered if x): negate="harder" else: diagnose_extra.append("Overmatch-negate got worse actual coverage than partial-positive.") # so don't set negate="harder", but we might still force_negate to set negate=True below break # and if negate==None AFTER this loop, didn't get all(pCovered) OR all(nCovered), in which case we fall back to negate=False (unless force_negate). In other words, negative indicators normally have to cover ALL non-occurrences to be passed, whereas positive indicators just have to cover SOME. This is in keeping with the idea of 'under-match is better than over-match' (because an under-matching negative indicator is like an over-matching positive one) if force_negate: negate = True if negate==True: ret,covered = nRet,nCovered elif negate=="harder": ret,covered = n2Ret,n2Covered if nbytes>ybytes and all(any(indicator in within_Nbytes(s+len(nonAnnot),nbytes) for indicator in ret)==any(indicator in within_Nbytes(s+len(nonAnnot),ybytes) for indicator in ret) for s in badStarts): # v3.242: we're using overmatch-negate on larger contexts, the smaller context might have failed to consider this compromise indicator due to there being an intersection between badStrs and okStrs when small, so when contexts are enlarged and we found it's the least-bad indicator to use anyway, check if we could then go back to the smaller context with same results (could reduce overmatch in practice even if not in the corpus, e.g. if somebody inputs 2 similar-looking words next to each other). TODO: check ybytes+ybytes_step*N for all N that yields < nbytes? if nonAnnot==diagnose: diagnose_write("Overriding output nbytes from %d to %d by same-result rule" % (nbytes,ybytes)) nbytes = ybytes else: ret,covered = pRet,pCovered if nonAnnot==diagnose: def report(didFind,negate,ret,covered): if not didFind: return "" if negate: indicators = negate+" indicators " else: indicators = "indicators " if ret: if len(ret) > 30: indicators=str(len(ret))+" "+indicators # +'/'.join(ret[:30]+['...']) else: indicators += '/'.join(ret) else: indicators = "no "+indicators if all(covered): notCovered = "" else: if negate: strs = badStrs else: strs = okStrs notCovered = [strs[i] for i in xrange(len(covered)) if not covered[i]] if len(notCovered) > 10: notCovered = notCovered[:10]+["..."] notCovered = ", not "+'/'.join(notCovered).replace('\n',"\\n") if negate=="overmatch-negative": overmatch=[s for s in okStrs if any(i in s for i in n2Ret)] if len(overmatch) > 10: overmatch = overmatch[:10]+["..."] if overmatch: notCovered += ", overmatch "+"/".join(overmatch).replace('\n',"\\n") return "%s (cover=%d/%d%s)" % (indicators,sum(1 for x in covered if x),len(covered),notCovered) if len(pOmit) > 200: pOmit = pOmit[:200]+"..." diagnose_extra = " ".join(diagnose_extra) if diagnose_extra: diagnose_extra=" "+diagnose_extra rr = ", ".join(r for r in [report(*i) for i in diagnostics] if r) if not rr: rr = "nothing" diagnose_write("tryNBytes(%d) on %s (avoiding '%s') found %s%s" % (nbytes,withAnnot_unistr,pOmit.replace(unichr(1),'/').replace('\n',"\\n"),rr,diagnose_extra)) return negate,ret,sum(1 for x in covered if x),len(covered),nbytes def badInfo(badStarts,nonAnnot,for_tty=True): ret = u"%d false positive" % len(badStarts) if not len(badStarts)==1: ret += "s" if len(badStarts) > yarowsky_debug: return ret for wordStart in badStarts: wordEnd = wordStart + len(nonAnnot) contextStart,contextEnd=max(0,wordStart-5),wordEnd+5 toRead = corpus_markedDown # but can we report it from the original corpus_unistr? if wordStart in m2c_map and wordEnd in m2c_map: toRead = corpus_unistr wordStart,wordEnd = m2c_map[wordStart],m2c_map[wordEnd] newCStart,newCEnd = contextStart,contextEnd while newCStart not in m2c_map and newCStart >= contextStart-5: newCStart-=1 while newCEnd not in m2c_map and newCEnd> %s)" % (toRead[contextStart:wordStart],toRead[wordStart:wordEnd],toRead[wordEnd:contextEnd])).replace("\n","\\n").replace("\r","\\r") return ret def unique_substrings(texts,omitFunc,valueFunc): # yield unique substrings of texts, in increasing length, with equal lengths sorted by highest score returned by valueFunc, and omitting any where omitFunc is true, or that uses any character not in markedUp_unichars (unless yarowsky_all set) if not yarowsky_all: # remove non-allowed chars from texts, splitting into smaller strings as necessary texts2 = [] ; append=texts2.append for text in texts: start = 0 for i in xrange(len(text)): if not text[i] in markedUp_unichars: if i>start: append(text[start:i]) start=i+1 if start len(l2): l1,l2 = l2,l1 return l2[:len(l1)]==l1 or l2[-len(l1):]==l1 class RulesAccumulator: def __init__(self): self.rules = {} # wspJoin(ruleAsWordlist) -> (negate-type, indicator-list, nbytes) or just indicator-list (if empty or negate,nbytes is default) self.rulesAsWordlists_By1stWord = {} # starting word -> list (order unimportant) of possible rules (as wordlists) that might apply, used internally by addRulesForPhrase for faster checks self.rejectedRules = set() self.seenPhrases = set() # de-duplicate, might speed up def addRulesForPhrase(self,phrase,canBackground=False): if phrase in self.seenPhrases or (diagnose_quick and diagnose): # if diagnose and diagnose_quick and mdStart+diagnose+mdEnd in phrase: pass # look at it again for diagnostics. But do we accept a diagnose that spans multiple words? should be pointed out by --diagnose-quick below if uncommented if diagnose and diagnose_quick and diagnose in markDown(phrase): pass # this version accepts diagnose of multiple words (and might also let some phrases through where it matches on an overlap) else: yield 0,0 ; return # TODO: document that this means the total 'covered' figure in the progress status is AFTER phrase de-duplication (otherwise we'd have to look up what the previous values were last time we saw it - no point doing that just for a quick statistic) self.seenPhrases.add(phrase) words = list(filter(lambda x:markDown(x).strip(),splitWords(phrase))) # filter out any that don't have base text (these will be input glitches, TODO: verify the annotation text is also just whitespace, warn if not) if not words: yield 0,0 ; return covered = [False]*len(words) # first see how much is covered by existing rules # (don't have to worry about the order, as we've been # careful about overlaps) for w in set(words): for ruleAsWordlist in self.rulesAsWordlists_By1stWord.get(w,[]): k = wspJoin(ruleAsWordlist) if yarowsky_half_thorough and self.rules[k]: pass # need to make worst-case assumption of context-sensitive rules NEVER matching here, because we need separately to evaluate them depending on each proposed rule from all_possible_rules elif checkCoverage(ruleAsWordlist,words,covered,self.rules[k]) and all(covered): yield len(covered),len(covered) ; return # no new rules needed for ruleAsWordlist, wStart in all_possible_rules(words,covered): rule = wspJoin(ruleAsWordlist) ; yBytesRet = [] if rule in self.rejectedRules: continue if rule in self.rules: continue # this can still happen even now all_possible_rules takes 'covered' into account, because the above checkCoverage assumes the rule won't be applied in a self-overlapping fashion, whereas all_possible_rules makes no such assumption (TODO: fix this inconsistency?) if yarowsky_half_thorough and len(ruleAsWordlist) > 1: c2 = covered[wStart:wStart+len(ruleAsWordlist)] def f(): for w in set(ruleAsWordlist): for r2 in self.rulesAsWordlists_By1stWord.get(w,[]): # if len(r2) >= len(c2): continue # as an optimisation, but this shouldn't be necessary if we're doing phrases in increasing length # if not yarowsky_multiword and len(r2)>1: continue # but self.rules[k] will be False below anyway, so this won't be much of a speedup, just saves one wspJoin k = wspJoin(r2) if self.rules[k] and checkCoverage(r2,ruleAsWordlist,c2,self.rules[k]) and all(c2): return f() if all(c2): continue rGen = test_rule(rule,yBytesRet,canBackground) r = getNext(rGen) if r=="backgrounded": yield r ; yield getNext(rGen) r = getNext(rGen) del rGen if not r or potentially_bad_overlap(self.rulesAsWordlists_By1stWord,ruleAsWordlist): self.rejectedRules.add(rule) # so we don't waste time evaluating it again (TODO: make sure rejectedRules doesn't get too big?) continue if len(yBytesRet): self.rules[rule] = yBytesRet[0] else: self.rules[rule] = [] # unconditional if yarowsky_half_thorough: k = [] else: k = self.rules[rule] checkCoverage(ruleAsWordlist,words,covered,k) # changes 'covered' if not ruleAsWordlist[0] in self.rulesAsWordlists_By1stWord: self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]] = [] self.rulesAsWordlists_By1stWord[ruleAsWordlist[0]].append(ruleAsWordlist) handle_diagnose_limit(rule) if all(covered): yield len(covered),len(covered) ; return # If get here, failed to completely cover the phrase. # ruleAsWordlist should be set to the whole-phrase rule. yield sum(1 for x in covered if x),len(covered) def saveRules(rulesAndConds): sys.stderr.write("\nSaving rules to %s... " % rulesFile) ; sys.stderr.flush() f = openfile(rulesFile,'w') d = [] # rulesAndConds is already a sorted list for k,v in rulesAndConds: if not v: d.append(k) elif type(v)==tuple: d.append((k,(v[0],sorted(v[1]),v[2]))) else: d.append((k,sorted(v))) json.dump(d,codecs.getwriter("utf-8")(f),indent=4,ensure_ascii=False) f.close() ; sys.stderr.write("done") sys.stderr.flush() def loadRules(): sys.stderr.write("Loading rules from %s... " % rulesFile) ; sys.stderr.flush() f = openfile(rulesFile) rulesAndConds = [] for item in json.load(codecs.getreader("utf-8")(f)): if type(item)==list: k,v = item if len(v)==3 and type(v[1])==list: v=tuple(v) rulesAndConds.append((k,v)) else: rulesAndConds.append((item,[])) sys.stderr.write("done\n") return rulesAndConds def handle_diagnose_limit(rule): global diagnose,diagnose_limit if diagnose and diagnose_limit and diagnose==markDown(rule): diagnose_limit -= 1 if not diagnose_limit: diagnose = False diagnose_write("limit reached, suppressing further diagnostics") def generate_map(): global m2c_map, precalc_sets, yPriorityDic sys.stderr.write("Generating corpus map... ") m2c_map = {} ; precalc_sets = {} muStart = downLenSoFar = 0 for s in re.finditer(re.escape(markupStart), corpus_unistr): s=s.start() md = markDown(corpus_unistr[muStart:s]) if markupStart in md: errExit("examples have nested markup! "+repr(md)) downLenSoFar += len(md) muStart = s m2c_map[downLenSoFar] = s # Added optimisation: do precalc_sets as well # (at least catch the 1-word cases) # -> this is now needed even if not ybytes e=corpus_unistr.find(markupEnd,s) if e>-1: e += len(markupEnd) k = corpus_unistr[s:e] if k not in precalc_sets: precalc_sets[k]=set() precalc_sets[k].add(downLenSoFar) yPriorityDic = {} if end_pri and ybytes: sys.stderr.write("yPriorityDic ... ") for w in splitWords(corpus_unistr[:corpus_unistr.index(end_pri)]): # (throws error if --end-pri is not in the corpus) wd = markDown(w) if wd in yPriorityDic: continue if diagnose==wd: diagnose_write("yPriorityDic[%s] = %s" % (wd,w)) yPriorityDic[wd] = w sys.stderr.write("done\n") executor = None def setup_parallelism(): # returns number of cores global executor if single_core: return 1 elif not hasattr(os,'fork'): return 1 # e.g. Windows, would need to write to filesystem like versions of annogen before 3.183 and distinguish main/non-main like versions of annogen before 3.38 (may get muddled up if running from __main__.py under python -m) elif executor: executor.shutdown(True) # MUST wait for the shutdown to finish before creating a new instance: some implementations seem to have a race condition try: import multiprocessing params = [multiprocessing.cpu_count()-1] if params[0] <= 0: return 1 if hasattr(multiprocessing,"get_context"): params.append(multiprocessing.get_context('fork')) # Python 3.4+: if this raises ValueError, we can't fork so won't multiprocess import concurrent.futures # Python 3. On Python 2, you can try the backport via 'pip install futures' but deadlock is possible. And seems to occur when annogen __name__ is not "__main__" after first normBatch returns, not sure why: if not (type("")==type(u"") or __name__=="__main__"): raise Exception("Python 2 as submodule is known to deadlock unless multicore disabled") executor = concurrent.futures.ProcessPoolExecutor(*params) # Do not reduce Python 2's sys.setcheckinterval() (or Python 3's setswitchinterval) if using ProcessPoolExecutor, or job starts can be delayed. cores = multiprocessing.cpu_count() if cores_command: os.system("%s %d" % (cores_command,cores)) return cores except: return 1 # can't fork for some reason def get_phrases(): # Returns a list of phrases in processing order, with length-numbers inserted in the list. Caches its result. global _gp_cache try: return _gp_cache except: pass # Due to the way we handle overlaps, it's better to process the shortest phrases first, as the longer phrases will yield more rule options and therefore more likely to be able to work around any "no-overlap" constraints imposed by already-processed examples. Something like: p2 = [] for p in splitWords(corpus_unistr,phrases=True): p2.append((min([len(p.split(markupStart)),len(p.split(markupMid)),len(p.split(markupEnd))]),len(p2),p)) # no need for splitWords(phrase) just to get len, but we do need the min-of-3 for robustness against the occasional markup error p2.sort() # by length, then by original position (note: if removing this sort, remove wordsThisPhrase from status_update) phrases = [] ; wordLen = None for p in p2: if not wordLen == p[0]: wordLen = p[0] phrases.append(wordLen-1) # because it's a .split length (really want an actual count, but it only has to be roughly right in this instance and splitLen-1 will do for speed) phrases.append(p[-1]) _gp_cache = phrases ; return phrases def setup_other_globals(): global corpus_markedDown, bigramCache corpus_markedDown = markDown(corpus_unistr) if not ybytes: return bigramCache=dict((i,[]) for i in set(corpus_markedDown[i:i+2] for i in xrange(len(corpus_markedDown)-1))) for i in xrange(len(corpus_markedDown)-1): k=corpus_markedDown[i:i+2] if k in bigramCache: bigramCache[k].append(i) if len(bigramCache[k]) > 100: del bigramCache[k] if freq_count or not yarowsky_all: global markedUp_unichars markedUp_unichars = set(list(u"".join(markDown(p) for p in get_phrases() if not type(p)==int))) def analyse(): accum = RulesAccumulator() covered = 0 # number of phrases we managed to 'cover' with our rules toCover = 0 # number of phrases we TRIED to cover (==covered if 100%) phraseNo = 0 ; wordLen = None phraseLastUpdate = phraseNo lastUpdate = startTime = time.time() backgrounded = [] ; phrases = get_phrases() while phraseNo < len(phrases): if type(phrases[phraseNo])==int: oldWL,wordLen = wordLen,phrases[phraseNo] if wordLen > 1 and oldWL == 1: # we currently use background jobs only for wordLen=1; other workers won't be needed after these jobs finish try: executor.shutdown(False) except: pass covered,toCover = flush_background(backgrounded," for #w change",covered,toCover) if wordLen > 1 and oldWL == 1 and cores_command: os.system(cores_command+" 1") phraseNo += 1 ; continue if time.time() >= lastUpdate + 2: if toCover: cov=int(100.0*covered/toCover) else: cov = 0 status_update(phraseNo,len(phrases),wordLen,len(accum.rules),phraseLastUpdate,lastUpdate,startTime,cov,len(accum.rejectedRules)) lastUpdate = time.time() ; phraseLastUpdate = phraseNo aRules = accum.addRulesForPhrase(phrases[phraseNo],wordLen==1) # We're saying canBackground only if wordLen==1 because longer phrases can be backgrounded only if they're guaranteed not to have mutual effects. Could look into when we can do that (or a separate pass through adding all len-1 rules 1st) and remove the executor.shutdown above, but test corpus is showing NO large collocation checks needed at #w=2+ anyway, so this work would not actually save generation time. arr = getNext(aRules) if arr=="backgrounded": backgrounded.append(aRules) else: coveredA,toCoverA = arr covered += coveredA ; toCover += toCoverA phraseNo += 1 flush_background(backgrounded) if wordLen==1: if cores_command: os.system(cores_command+" 1") try: executor.shutdown(False) # if wordLen never exceeded 1 so it didn't get shut down above, might as well free up other processes now except: pass if diagnose_manual: test_manual_rules() rules = sorted(accum.rules.items()) # sorting it makes the order stable across Python implementations and insertion histories: useful for diff when using concurrency etc (can affect order of otherwise-equal Yarowsky-like comparisons in the generated code) if write_rules: saveRules(rules), sys.exit(0) else: return rules try: import Queue as queue # Python 2 except: import queue # Python 3 def flush_background(backgrounded,why="",covered=0,toCover=0): q = queue.Queue() origLen = len(backgrounded) if origLen: sys.stderr.write("Collecting backgrounded results%s: 0/%d%s" % (why,origLen,clear_eol)) while backgrounded: b = backgrounded.pop() getNext(b).add_done_callback(lambda _,b=b:q.put(b)) for count in xrange(origLen): coveredA,toCoverA = getNext(q.get()) covered += coveredA ; toCover += toCoverA sys.stderr.write("\rCollecting backgrounded results%s: %d/%d" % (why,count+1,origLen)) if origLen: sys.stderr.write("\n") return covered,toCover def read_manual_rules(): if not manualrules: return for l in openfile(manualrules): if not l.strip(): continue l=l.decode(incode).strip() # TODO: manualrulescode ? if removeSpace: l=re.sub(re.escape(markupEnd)+r'\s+'+re.escape(markupStart),(markupEnd+markupStart).replace('\\',r'\\'),l,flags=re.UNICODE) yield l def test_manual_rules(): for l in read_manual_rules(): words = list(splitWords(l)) # Prevent KeyError in getOkStarts: for w in words: if w not in precalc_sets: precalc_sets[w]=set() # Call test_rule: yb = [] if not getNext(test_rule(l,yb)) or len(yb): getBuf(sys.stderr).write(("\nWARNING: Manual rule '%s' may contradict the examples. " % l).encode(terminal_charset)) if len(words)==1: global diagnose,diagnose_limit,ybytes od,odl,oy,diagnose,diagnose_limit,ybytes = diagnose,diagnose_limit,ybytes,markDown(l),0,ybytes_max getNext(test_rule(l,[])) diagnose,diagnose_limit,ybytes = od,odl,oy def zapTrigraphs(x): return re.sub(br"\?\?([=/'()<>!-])",br'?""?\1',x) # to get rid of trigraph warnings, TODO might get a marginal efficiency increase if do it to the entire C file at once instead) def c_escapeRawBytes(s): # as it won't be valid outcode; don't want to crash any editors/viewers of the C file if s.endswith(b'\x00'): s=s[:-1] # as the C compiler will add a terminating 0 anyway return re.sub(br"(?len(l) and not l2[-1]: del l2[-1] # don't mind extra blank line(s) at end of output if not len(l)==len(l2): open('reannotator-debug-in.txt','wb').write(os.linesep.join(l).encode(outcode)+B(os.linesep)) open('reannotator-debug-out.txt','wb').write(os.linesep.join(l2).encode(outcode)+B(os.linesep)) errExit("Reannotator command didn't output the same number of lines as we gave it (gave %d, got %d). Input and output have been written to reannotator-debug-in.txt and reannotator-debug-out.txt for inspection. Bailing out." % (len(l),len(l2))) if stderr_newline: sys.stderr.write("reannotated %d items\n" % len(l)) else: sys.stderr.write("(%d items)\n" % len(l)) toReannotateSet = set() ; reannotateDict = dict(zip(l,l2)) ; del l,l2 if compress: global squashStrings ; squashStrings = set() # discard any that were made in any reannotator dry-run dryRun(False) # redo with the new annotation strings (or do for the first time if no reannotator) pairs = squashFinish() else: pairs = b"" for rule,conds in rulesAndConds: addRule(rule,conds,byteSeq_to_action_dict) for l in read_manual_rules(): if diagnose_manual and l in rulesAndConds: getBuf(sys.stderr).write(("\nINFO: Possible unnecessary manual rule '%s'\n" % l).encode(terminal_charset)) addRule(l,[],byteSeq_to_action_dict,True) longest_rule_len = max(len(b) for b in iterkeys(byteSeq_to_action_dict)) longest_rule_len += ybytes_max # because buffer len is 2*longest_rule_len, we shift half of it when (readPtr-bufStart +ybytes >= bufLen) and we don't want this shift to happen when writePtr-bufStart = Half_Bufsize-1 and readPtr = writePtr + Half_Bufsize-1 (TODO: could we get away with max(0,ybytes_max-1) instead? but check how this interacts with the line below; things should be safe as they are now). This line's correction was missing in Annogen v0.599 and below, which could therefore occasionally emit code that, when running from stdin, occasionally replaced one of the document's bytes with an undefined byte (usually 0) while emitting correct annotation for the original byte. (This could result in bad UTF-8 that crashed the bookmarklet feature of Web Adjuster v0.21 and below.) longest_rule_len = max(ybytes_max*2, longest_rule_len) # make sure the half-bufsize is at least ybytes_max*2, so that a read-ahead when pos is ybytes_max from the end, resulting in a shift back to the 1st half of the buffer, will still leave ybytes_max from the beginning, so yar() can look ybytes_max-wide in both directions b = BytecodeAssembler() b.addActionDictSwitch(byteSeq_to_action_dict,False) ddrivn = b.link() if zlib: origLen = b.origLen del b if javascript: if browser_extension: return outfile.write(txt_escapeRawBytes(ddrivn)) else: return outfile.write(js_start+b"data: \""+js_escapeRawBytes(ddrivn)+b"\",\n"+js_end+b"\n") # not Uint8Array (even if browser compatibility is known): besides taking more source space, it's typically ~25% slower to load than string, even from RAM elif dart: if dart_datafile: if os.sep in c_filename: d=c_filename[:c_filename.rindex(os.sep)]+os.sep else: d = "" if os.sep in dart_datafile: d += dart_datafile[dart_datafile.rindex(os.sep)+1:] else: d += dart_datafile open(d,'wb').write(ddrivn) sys.stderr.write("Wrote "+d+" (ensure this ships as "+dart_datafile+")\n") if dart_datafile and zlib: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"await(File('"+B(dart_datafile)+b"').readAsBytes())")) elif zlib: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"\""+dart_escapeRawBytes(ddrivn)+b"\".codeUnits")) elif dart_datafile: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"String.fromCharCodes(await(File('"+B(dart_datafile)+b"').readAsBytes()))")) else: return outfile.write(dart_src.replace(b"%%DATA_INIT%%",b"\""+B(dart_escapeRawBytes(ddrivn))+b"\"")) elif python: dd2 = repr(ddrivn) if not dd2.startswith('b'): dd2='b'+dd2 # (if we're generating in Python 2, we still want 2+3 compatibility) outfile.write(py_start+b"\ndata="+B(dd2)+b"\n") if zlib: outfile.write(b"import zlib; data=zlib.decompress(data)\n") return outfile.write(py_end+b"\n") elif java: start = java_src.replace(b"%%JPACKAGE%%",B(jPackage)) start = start.replace(b"%%DLEN%%",B(str(len(ddrivn)))) if zlib: start = start.replace(b"%%ULEN%%",B(str(origLen))) else: start = c_start outfile.write(start.replace(b'%%LONGEST_RULE_LEN%%',B(str(longest_rule_len))).replace(b"%%YBYTES%%",B(str(ybytes_max))).replace(b"%%PAIRS%%",pairs)+b"\n") if zlib: dataName = "origData" else: dataName = "data" if java: open(jSrc+("/../assets/annotate.dat" if android else "/annotate.dat"),"wb").write(ddrivn) else: outfile.write(b"static unsigned char "+B(dataName)+b"[]=\""+c_escapeRawBytes(ddrivn)+b'\";\n') if zlib: outfile.write(c_zlib.replace(b'%%ORIGLEN%%',B(str(origLen))).replace(b'%%ZLIBLEN%%',B(str(len(ddrivn))))+b"\n") # rather than using sizeof() because we might or might not want to include the compiler's terminating nul byte outfile.write(c_datadrive+b"\n") del ddrivn if android: open(java+os.sep+"MainActivity.java","wb").write(android_src.replace(b"%%JPACKAGE%%",B(jPackage)).replace(b'%%ANDROID-URL%%',B(android))) open(java+os.sep+"BringToFront.java","wb").write(android_bringToFront.replace(b"%%JPACKAGE%%",B(jPackage))) open(jSrc+"/../assets/clipboard.html",'wb').write(android_clipboard) if android_template: aStamp = android_version_stamp try: versionName = re.findall(B(re.escape("versionName")+r'\s*=\s*"([^"]*)"'),open(jSrc+"/../AndroidManifest.xml",'rb').read())[0] except: versionName = None if versionName: aStamp = aStamp.replace(b"%%DATE%% version",b"%%DATE%% version "+versionName) open(jSrc+"/../assets/index.html",'wb').write(android_template.replace(b"VERSION_GOES_HERE",aStamp.replace(b"%%DATE%%",b"%d-%02d-%02d" % time.localtime()[:3]).replace(b"%%TIME%%",b"%d:%02d" % time.localtime()[3:5]))) update_android_manifest() open(jSrc+"/../res/layout/activity_main.xml","wb").write(android_layout) open(jSrc+"/../res/menu/main.xml","wb").write(b'\n') # TODO: is this file even needed at all? open(jSrc+"/../res/values/dimens.xml","wb").write(b'16dp16dp\n') open(jSrc+"/../res/values/styles.xml","wb").write(b'\n') # won't compile on SDKs that don't know about API 29, e.g. Ubuntu 22.04's packages: could try using introspection to call setForceDarkAllowed() (but need to solve the 'missing d8' problem first if want to upload the resulting APK) open(jSrc+"/../res/values/strings.xml","wb").write(B('\n'+app_name.replace('&','&').replace('<','<').replace('>','>')+'\n')) open(jSrc+"/../res/xml/network_security_config.xml","wb").write(b'\n

\n') elif not java: outfile.write(c_end) outfile.write(b"\n") del byteSeq_to_action_dict def update_android_manifest(): try: manifest = old_manifest = open(jSrc+"/../AndroidManifest.xml",'rb').read() except IOError: manifest,old_manifest = android_manifest,None def readAttr(aName): allVals = re.findall(B(re.escape(aName)+r'\s*=\s*"([^"]*)"'),manifest) assert len(allVals)==1, "AndroidManifest.xml has %d instances of %s, should be 1" % (len(allVals),aName) return allVals[0] versionCode,versionName = readAttr("android:versionCode"),readAttr("android:versionName") if b"android:sharedUserId" in manifest: sharedUID = readAttr("android:sharedUserId") else: sharedUID = b"" if android_upload: sys.stderr.write("AndroidManifest.xml: bumping versionCode for upload\n (assuming you've taken care of versionName separately, if needed)\n") # (might not be needed if the previous upload wasn't actually released for example) versionCode = B(str(int(versionCode)+1)) manifest = android_manifest.replace(b'%%JPACKAGE%%',B(jPackage)).replace(b'android:versionCode="1"',b'android:versionCode="'+versionCode+b'"').replace(b'android:versionName="1.0"',b'android:versionName="'+versionName+b'"').replace(b'android:sharedUserId=""',b'android:sharedUserId="'+sharedUID+b'"').replace(b'android:sharedUserId="" ',b'') if not manifest==old_manifest: open(jSrc+"/../AndroidManifest.xml","wb").write(manifest) else: assert not android_upload, "Couldn't bump version code in "+repr(manifest) def setup_browser_extension(): dirToUse = browser_extension.replace(' ','') sys.stderr.write("Writing to "+dirToUse+"\n") try: os.mkdir(dirToUse) except: pass def icons(key,sizes): if any(os.path.isfile(dirToUse+os.sep+s+".png") for s in sizes): return b',"'+B(key)+b'":{'+b",".join(B('"%s":"%s.png"' % (s,s)) for s in sizes if os.path.isfile(dirToUse+os.sep+s+".png"))+b"}" else: return b"" try: # increment existing version if present versionName = re.search(b'"version": *"([^"]*)"',open(dirToUse+"/manifest.json","rb").read()).group(1) versionName = versionName.split(b'.') versionName[-1] = B(str(int(versionName[-1])+1)) versionName = b'.'.join(versionName) except: versionName = b"0.1" open(dirToUse+"/manifest.json","wb").write((br"""{ "manifest_version": """+(b"3" if manifest_v3 else b"2")+br""", "name": "%s",%s "version": "%s", "browser_specific_settings": { "id" :"%s", "gecko_android": {}}, "background": { """+(b'"service_worker": "background.js"' if manifest_v3 else b'"scripts": ["background.js"]')+br""" }, "content_scripts": [{"matches": [""], "js": ["content.js"], "css": ["ruby.css"]}], """+(b'"action"' if manifest_v3 else b'"browser_action"')+br""":{"default_title":"Annotate","default_popup":"config.html","browser_style": true%s}, """+(b'"host_permissions": [""], "permissions": ["clipboardRead","storage","scripting"]' if manifest_v3 else b'"permissions": ["","clipboardRead"]')+b"%s}") % (B(browser_extension),B((('" description": "%s",'%browser_extension_description) if browser_extension_description else "")),versionName,B(gecko_id),icons("default_icon",["16","32"]),icons("icons",["16","32","48","96"]))) open(dirToUse+"/background.js","wb").write(js_start+js_end) open(dirToUse+"/content.js","wb").write(jsAnnot(False,True)) open(dirToUse+"/config.html","wb").write(extension_config) open(dirToUse+"/config.js","wb").write(extension_confjs) open(dirToUse+"/ruby.css","wb").write(extension_rubycss) global c_filename c_filename = dirToUse+"/annotate-dat.txt" if isatty(sys.stdout) and not java and not priority_list and not browser_extension and not write_rules: sys.stderr.write("Will write to "+c_filename+"\n") # will open it later (avoid having a 0-length file sitting around during the analyse() run so you don't rm it by mistake) def openfile(fname,mode='r'): lzma = bz2 = None mode += 'b' # Python 2+3 compatibility: always binary if fname.endswith(".xz"): import lzma # 'pip install lzma' or 'apt-get install python2.7-lzma' may be required for .xz files elif fname.endswith(".bz2"): import bz2 if re.match("https?://",fname) or fname.startswith("ftp://"): assert mode=='rb', "cannot write to "+fname try: from urllib2 import urlopen # Python 2 except: from urllib.request import urlopen # Py3 sys.stderr.write("Fetching "+fname+"\n") fileobj = urlopen(fname) # If it's bz2 or xz, we'd better decompress in one operation. (gz library can stream) if fname.endswith(".bz2"): from cStringIO import StringIO return StringIO(bz2.decompress(fileobj.read())) elif fname.endswith(".xz"): from cStringIO import StringIO return StringIO(lzma.decompress(fileobj.read())) elif fname.endswith(".bz2"): return bz2.BZ2File(fname,mode) elif fname.endswith(".xz"): return lzma.LZMAFile(fname,mode) else: fileobj = open(fname,mode) # if get this far, we can use fileobj if fname.endswith(".gz"): import gzip ; return gzip.GzipFile(fileobj=fileobj,mode=mode) else: return fileobj def rm_f(fname): try: os.remove(fname) except OSError: pass import atexit def set_title(t): if t: try: from setproctitle import setproctitle ; setproctitle(t) except: pass # TODO: could also try others from adjuster if not isatty(sys.stderr): return if t: atexit.register(set_title,"") is_screen = (term=="screen" and os.environ.get("STY","")) is_tmux = (term=="screen" and os.environ.get("TMUX","")) if is_xterm or is_tmux: sys.stderr.write("\033]0;%s\007" % (t,)) # ("0;" sets both title and minimised title, "1;" sets minimised title, "2;" sets title. Tmux takes its pane title from title (but doesn't display it in the titlebar)) elif is_screen: os.system("screen -X title \"%s\"" % (t,)) def diagnose_write(s,label="Diagnose"): getBuf(sys.stderr).write(B(bold_on+label+": "+bold_off)+s.encode(terminal_charset,'replace')+B(clear_eol+'\n')) try: screenWidth = int(os.environ['COLUMNS']) except: import struct, fcntl, termios try: screenWidth = struct.unpack('hh',fcntl.ioctl(sys.stderr,termios.TIOCGWINSZ,'xxxx'))[1] except: screenWidth = 45 # conservative if not compile_only: set_title("annogen") if read_rules: rulesAndConds = loadRules() else: read_and_normalise() if priority_list: if os.path.exists(priority_list): sys.stderr.write("Reading "+priority_list+"\n") def getPri(line): word,pri = line.decode(outcode).rstrip().rsplit(None,1) try: return word,int(pri) except: return word,float(pri) existingPris=dict(getPri(l) for l in openfile(priority_list) if len(l.strip().split())>=2) else: existingPris = {} sys.stderr.write("Parsing...") ; sys.stderr.flush() i=[[markDown(w) for w in splitWords(phrase)] for phrase in splitWords(corpus_unistr,phrases=True)] del corpus_unistr sys.stderr.write(" calling PairPriorities...\n") out="".join(w+"\t"+str(f)+os.linesep for w,f in PairPriorities(i,existingPris) if f).encode(outcode) # (don't open the output before here, in case exception) if existingPris: sys.stderr.write("Updating "+priority_list+"...") else: sys.stderr.write("Writing "+priority_list+"...") sys.stderr.flush() openfile(priority_list,'w').write(out) sys.stderr.write(" done\n") sys.exit() generate_map() ; setup_other_globals() if freq_count: sys.stderr.write("Writing "+freq_count+"...") ; sys.stderr.flush() counts = {} for c in post_normalise_translate(corpus_unistr): if c in markedUp_unichars and c.strip(): if not c in counts: counts[c] = 0 counts[c] += 1 cSets = {} for k,v in iteritems(post_normalise): if not unichr(v) in cSets: cSets[unichr(v)]=unichr(v) cSets[unichr(v)] += unichr(k) openfile(freq_count,'w').write((u"\n".join(cSets.get(c,c) for _,c in list(reversed(sorted((n,c) for c,n in iteritems(counts))))[:1000])+u"\n").encode('utf-8')) # TODO: customise the 1000 (maybe greater increment after 1000) sys.stderr.write(" done\n") setup_parallelism() # re-copy globals to cores try: rulesAndConds = analyse() finally: sys.stderr.write("\n") # so status line is not overwritten by 1st part of traceback on interrupt etc del _gp_cache def cmd_or_exit(cmd): sys.stderr.write(cmd+"\n") r = os.system(cmd) if not r: return if r&0xFF == 0: r >>= 8 # POSIX sys.exit(r) if not compile_only: if browser_extension: setup_browser_extension() if c_filename: outfile = openfile(c_filename,'w') else: outfile = getBuf(sys.stdout) outputParser(rulesAndConds) ; del rulesAndConds outfile.close() ; sys.stderr.write("Output complete\n") if android: can_compile_android = all(x in os.environ for x in ["SDK","PLATFORM","BUILD_TOOLS"]) can_track_android = (can_compile_android and android_upload) or ("GOOGLE_PLAY_TRACK" in os.environ and "SERVICE_ACCOUNT_KEY" in os.environ and not os.environ.get("ANDROID_NO_RETRACK","")) if can_compile_android and compile_only and android_upload: update_android_manifest() # AndroidManifest.xml will not have been updated, so we'd better do it now if can_compile_android or can_track_android: os.chdir(jSrc+"/..") dirName0 = S(getoutput("pwd|sed -e s,.*./,,")) dirName = shell_escape(dirName0) if can_compile_android: # TODO: use aapt2 and figure out how to make a 'bundle' with it so Play Store can accept new apps after August 2021 ? (which requires giving them your signing keys, and I don't see the point in enforcing the 'bundle' format for a less than 1k saving due to not having to package multiple launcher icons on each device, and you'd probably have to compile non-Store apks separately.) Don't know if/when updates to pre-Aug2021 apps will be required to be in Bundle format. cmd_or_exit("$BUILD_TOOLS/aapt package -0 '' -v -f -I $PLATFORM/android.jar -M AndroidManifest.xml -A assets -S res -m -J gen -F bin/resources.ap_") # (the -0 '' (no compression) is required if targetSdkVersion=30 or above, and shouldn't make much size difference on earlier versions as annotate.dat is itself compressed) cmd_or_exit("find src/"+jRest+" -type f -name '*.java' > argfile && javac -Xlint:deprecation -classpath $PLATFORM/android.jar -sourcepath 'src;gen' -d bin gen/"+jRest+"/R.java @argfile && rm argfile") # as *.java likely too long (-type f needed though, in case any *.java files are locked for editing in emacs) if os.path.exists(os.environ["BUILD_TOOLS"]+"/dx"): # older SDK a = " -JXmx4g --force-jumbo" # -J option must go first if "min-sdk-version" in getoutput("$BUILD_TOOLS/dx --help"): a += " --min-sdk-version=1" # older versions of dx don't have that flag, but will be min-sdk=1 anyway cmd_or_exit("$BUILD_TOOLS/dx"+a+" --dex --output=bin/classes.dex bin/") else: cmd_or_exit("$BUILD_TOOLS/d8 --min-api 1 --output bin $(find bin -type f -name '*.class')") cmd_or_exit("cp bin/resources.ap_ bin/"+dirName+".ap_") cmd_or_exit("cd bin && $BUILD_TOOLS/aapt add -0 '' "+dirName+".ap_ classes.dex") cmd_or_exit("rm -f bin/"+dirName0+".apk && $BUILD_TOOLS/zipalign 4 bin/"+dirName+".ap_ bin/"+dirName+".apk && rm -f ../"+dirName0+".apk") if all(x in os.environ for x in ["KEYSTORE_FILE","KEYSTORE_USER","KEYSTORE_PASS"]): cmd_or_exit("$BUILD_TOOLS/apksigner sign --ks $KEYSTORE_FILE --v1-signer-name $KEYSTORE_USER --ks-pass env:KEYSTORE_PASS --key-pass env:KEYSTORE_PASS --out ../"+dirName+".apk bin/"+dirName+".apk") cmd_or_exit("rm -f bin/"+dirName0+".ap_ bin/"+dirName0+".apk") if not can_track_android: cmd_or_exit("du -h ../"+dirName+".apk") if can_track_android: import httplib2,googleapiclient.discovery,oauth2client.service_account # pip install google-api-python-client (or pip install --upgrade google-api-python-client if yours is too old). Might need pip install oauth2client also. trackToUse = os.environ.get("GOOGLE_PLAY_TRACK","").strip() if not trackToUse: trackToUse='beta' for tryNo in xrange(10): try: if tryNo: sys.stderr.write("\nRetrying in 1 minute... ") sys.stderr.flush() ; time.sleep(60) sys.stderr.write("now\n") sys.stderr.write("Logging in... ") sys.stderr.flush() service = googleapiclient.discovery.build('androidpublisher', 'v3', http=oauth2client.service_account.ServiceAccountCredentials.from_json_keyfile_name(os.environ['SERVICE_ACCOUNT_KEY'],'https://www.googleapis.com/auth/androidpublisher').authorize(httplib2.Http())) eId = service.edits().insert(body={},packageName=jPackage).execute()['id'] if android_upload: sys.stderr.write("uploading... ") sys.stderr.flush() v = service.edits().apks().upload(editId=eId,packageName=jPackage,media_body="../"+dirName+".apk").execute()['versionCode'] sys.stderr.write("\rUploaded "+dirName+".apk (version code "+str(v)+")\n") open(jSrc+"/../.last-versionCode","w").write(str(v)) else: v = int(open(jSrc+"/../.last-versionCode").read().strip()) # if this fails, you probably didn't run annogen v0.691+ to compile the APK before trying to change track (see instructions printed when GOOGLE_PLAY_TRACK environment variable is not set) if os.environ.get("GOOGLE_PLAY_CHANGELOG",""): service.edits().tracks().update(editId=eId,track=trackToUse,packageName=jPackage,body={u'releases':[{u'versionCodes':[v],u"releaseNotes":[{u"language":u"en-US",u"text":T(os.environ["GOOGLE_PLAY_CHANGELOG"])}],u'status':u'completed'}],u'track':trackToUse}).execute() # needs to be "en-US" as just "en" is dropped by the Store, although it does say you can "add as supported language in your app's Store Listing" else: service.edits().tracks().update(editId=eId,track=trackToUse,packageName=jPackage,body={u'releases':[{u'versionCodes':[v],u'status':u'completed'}],u'track':trackToUse}).execute() if not android_upload: sys.stderr.write("Warning: GOOGLE_PLAY_CHANGELOG not set, any release notes will be deleted\n") sys.stderr.write("Committing... ") sys.stderr.flush() sys.stderr.write("\rCommitted edit %s: %s.apk v%s to %s\n" % (service.edits().commit(editId=eId,packageName=jPackage).execute()['id'],dirName,v,trackToUse)) break except httplib2.HttpLib2Error: pass if not can_compile_android and not can_track_android: sys.stderr.write("Android source has been written to "+jSrc[:-3]+""" To have Annogen build it for you, set these environment variables before the Annogen run (change the examples obviously) : export SDK=/home/example/Android/Sdk export PLATFORM=$SDK/platforms/android-33 export BUILD_TOOLS=$SDK/build-tools/33.0.1 # To sign the build (required for release), additionally set: export KEYSTORE_FILE=/path/to/keystore export KEYSTORE_USER='your user name' export KEYSTORE_PASS='your password' # The app will be compatible with Android 1.0+ # but SDK 24+ is required on the build machine. # SDK 24 was released mid-2016. If you have an older Intel-based machine whose main OS cannot be upgraded, you may be able to install a newer SDK on a virtual machine, e.g. on a 2011 Mac stuck on MacOS 10.7, I used VirtualBox 4.3.4, Vagrant 1.9.5, Debian 8 Jessie and SSH with X11 forwarding to install Android Studio 3.5 from 2019, although for apksigner to work I also had to add 'deb http://archive.debian.org/debian/ jessie-backports main' to /etc/apt/sources.list and do 'sudo apt-get -o Acquire::Check-Valid-Until=false update' and 'sudo apt-get install -t jessie-backports openjdk-8-jdk openjdk-8-jre openjdk-8-jre-headless ca-certificates-java' and 'sudo apt-get --purge remove openjdk-7-jre-headless' # On non-Intel architectures, I suggest installing box64 (and box86 is also useful for 32-bit binaries), and installing the x86 Android SDK (you'll need an X11 connection to it to download and install Android Studio). Ubuntu 22.04's multi-architecture android-sdk-build-tools package is not suitable (even together with google-android-platform-24-installer) because it doesn't provide the necessary update to app signing in SDK 24. # You can upload the apk to Google Play to update an existing app. # Since August 2021, Google Play enforces a different 'bundle' format # for new apps, which I don't yet know how to make. # To upload the update release to Google Play, additionally set: export SERVICE_ACCOUNT_KEY=/path/to/api-*.json # (must be an absolute path) # and optionally: export GOOGLE_PLAY_CHANGELOG="Updated annotator" export GOOGLE_PLAY_TRACK=alpha # default beta (please don't put production); however sending yourself the APK file is usually faster than using the alpha track if it's just to test on your own devices # If the above variables including SERVICE_ACCOUNT_KEY are set (and you haven't set ANDROID_NO_UPLOAD, below), then you'll also get an openPlayStore() function added to the Javascript interface for use in 'check for updates' links. # After testing, you can change the track of an existing APK by setting ANDROID_NO_UPLOAD=1 but still setting SERVICE_ACCOUNT_KEY and GOOGLE_PLAY_TRACK (and not ANDROID_NO_RETRACK), and run with --compile-only. You will need to set GOOGLE_PLAY_CHANGELOG again when doing this, as the Google API now discards changelogs on track-changes unless they are re-specified. You may also wish to create some icons in res/drawable* (using Android Studio or the earlier ADT tools). On Google Play you may wish to set Release management - Pre-launch report - Settings - Enable pre-launch reports to OFF, or it'll report issues on the websites you link to (and maybe crashes due to Firebase issues), which (if you don't want them) is wasting resources. """) # TODO: try if("true".equals(android.provider.Settings.System.getString(getContentResolver(),"firebase.test.lab"))) browser.loadUrl("about:blank"); (but turning off unwanted reports is better) elif c_filename and c_compiler: cmd = c_compiler # should include any -o option if zlib: cmd += " -lz" # TODO: is this always correct on all platforms? (although user can always simply redirect the C to a file and compile separately) cmd_or_exit(cmd + " " + shell_escape(c_filename)) elif compile_only: errExit("Don't know what compiler to run for this set of options")