Initial commit
This commit is contained in:
		
						commit
						167d208685
					
				
					 4 changed files with 275 additions and 0 deletions
				
			
		
							
								
								
									
										19
									
								
								LICENSE
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								LICENSE
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | ||||||
|  | MIT License Copyright (c) 2020 xenofem <xenofem at xeno dot science> | ||||||
|  | 
 | ||||||
|  | Permission is hereby granted, free of charge, to any person obtaining a copy | ||||||
|  | of this software and associated documentation files (the "Software"), to deal | ||||||
|  | in the Software without restriction, including without limitation the rights | ||||||
|  | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||||||
|  | copies of the Software, and to permit persons to whom the Software is furnished | ||||||
|  | to do so, subject to the following conditions: | ||||||
|  | 
 | ||||||
|  | The above copyright notice and this permission notice (including the next | ||||||
|  | paragraph) shall be included in all copies or substantial portions of the | ||||||
|  | Software. | ||||||
|  | 
 | ||||||
|  | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||||
|  | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||||||
|  | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS | ||||||
|  | OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||||||
|  | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF | ||||||
|  | OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||||||
							
								
								
									
										25
									
								
								README
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								README
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,25 @@ | ||||||
|  | # diceware | ||||||
|  | 
 | ||||||
|  | This is the janky python code I used to help me generate the wordlist | ||||||
|  | I describe at [https://xeno.science/dice](https://xeno.science/dice). | ||||||
|  | This code is very bad and will probably crash if your terminal window | ||||||
|  | is too small.  Also I'm doing something wrong with ncurses, and after | ||||||
|  | it loads it'll show you a blank screen until you press a key. | ||||||
|  | 
 | ||||||
|  | ## Dependencies | ||||||
|  | 
 | ||||||
|  | You'll need Python 3 and NumPy. | ||||||
|  | 
 | ||||||
|  | ## Usage | ||||||
|  | 
 | ||||||
|  | First, [download the English-language ConceptNet Numberbatch | ||||||
|  | dataset](https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz), | ||||||
|  | and then run `pickle-vectors.py` to parse the Numberbatch vectors and | ||||||
|  | store them in a pickle for later access.  This takes a while but | ||||||
|  | you'll only need to do it once. | ||||||
|  | 
 | ||||||
|  | Then, run | ||||||
|  | 
 | ||||||
|  | ```bash | ||||||
|  | diceware-editor.py [word list file] | ||||||
|  | ``` | ||||||
							
								
								
									
										214
									
								
								diceware-editor.py
									
										
									
									
									
										Executable file
									
								
							
							
						
						
									
										214
									
								
								diceware-editor.py
									
										
									
									
									
										Executable file
									
								
							|  | @ -0,0 +1,214 @@ | ||||||
|  | #!/usr/bin/env python3 | ||||||
|  | 
 | ||||||
|  | import curses | ||||||
|  | import curses.ascii | ||||||
|  | import curses.textpad | ||||||
|  | import numpy | ||||||
|  | import numpy.linalg | ||||||
|  | import os.path | ||||||
|  | import pickle | ||||||
|  | import random | ||||||
|  | import sys | ||||||
|  | import textwrap | ||||||
|  | 
 | ||||||
TARGET_LIST_LENGTH = 6**4  # 1296 entries: one word per roll of 4 six-sided dice
MAX_WORD_LENGTH = 25  # column width of the word-list pad (dice digits + word)
SUGGESTION_DISTANCE_THRESHOLD = 1.2  # a suggested word must be farther than this from every listed word
|  | 
 | ||||||
def vectorize(word):
    """Normalize *word* into ConceptNet Numberbatch key form.

    Numberbatch keys use underscores where natural text has spaces or
    hyphens, so both are translated.
    """
    return word.replace(' ', '_').replace('-', '_')

# Numberbatch key -> numpy embedding vector; populated in main() from
# numberbatch.pkl (built by pickle-vectors.py).
vectors = {}
# Memo of pairwise embedding distances, keyed by the *sorted* pair of
# normalized keys so that distance(a, b) and distance(b, a) share one entry.
distance_cache = {}
def memoized_distance(w1, w2):
    """Return the Euclidean distance between the embeddings of w1 and w2.

    Returns None when either word has no Numberbatch vector.  Results are
    memoized in distance_cache; the key is order-independent.
    """
    k1 = vectorize(w1)
    k2 = vectorize(w2)
    if k1 not in vectors or k2 not in vectors:
        return None
    # Flat tuple key replaces the original nested-dict memo: one lookup,
    # same semantics.
    key = (k1, k2) if k1 <= k2 else (k2, k1)
    if key not in distance_cache:
        distance_cache[key] = numpy.linalg.norm(vectors[key[0]] - vectors[key[1]])
    return distance_cache[key]
|  | 
 | ||||||
def prefix(word):
    """Return the first three lowercase ASCII letters of *word*.

    This is the word's diceware key: every letter is lowercased, anything
    that is not an ASCII letter is dropped, and the result is truncated to
    three characters (it may be shorter for very short inputs).
    """
    letters = [c for c in word.lower() if curses.ascii.islower(c)]
    return ''.join(letters[:3])
|  | 
 | ||||||
def main(stdscr):
    """Interactive curses editor for a 4-dice (6**4 = 1296 word) diceware list.

    Loads the word list named in sys.argv[1] (one "DDDD word" entry per line),
    then runs an edit loop: add/delete/search words, ask for suggestions drawn
    from the ConceptNet Numberbatch vocabulary, and write the list back with
    freshly assigned dice numbers.

    Screen layout: left column is the scrolling word list, the right side is
    an info box with distance statistics, and the bottom line is the
    status/command bar.

    NOTE(review): window sizes assume the terminal is at least
    MAX_WORD_LENGTH*3 columns wide (README warns it crashes on small
    terminals) — not validated here.
    """
    global vectors
    stdscr.clear()
    stdscr.leaveok(False)  # keep the hardware cursor where we place it

    filename = sys.argv[1]

    # Pad twice the target length so the list can grow past the goal.
    list_pad = curses.newpad(TARGET_LIST_LENGTH*2, MAX_WORD_LENGTH)
    status_line = curses.newwin(1, curses.COLS, curses.LINES - 1, 0)
    info_box = curses.newwin(curses.LINES - 1, MAX_WORD_LENGTH*2, 0, MAX_WORD_LENGTH)

    loading_message = "Loading conceptnet numberbatch vectors..."
    status_line.addstr(0, 0, loading_message)
    # BUGFIX: refresh before the slow pickle load; without it the loading
    # message was never drawn and the user stared at a blank screen.
    status_line.refresh()
    with open('numberbatch.pkl', 'rb') as f:
        vectors = pickle.load(f)
    status_line.clear()
    status_line.addstr(0, 0, "Vectors loaded!")
    status_line.refresh()

    # words maps the 3-letter diceware prefix to the full word; prefixes are
    # unique keys, so any 3-letter abbreviation identifies exactly one word.
    words = {}
    if os.path.isfile(filename):
        with open(filename) as f:
            for line in f:
                # Entries look like "1136 word"; keep the part after the dice
                # digits (or the whole line if there are none).
                word = line.strip().split(maxsplit=1)[-1]
                if len(prefix(word)) < 3:
                    continue
                words[prefix(word)] = word

    # Warm the distance memo for every unordered pair so the interactive
    # loop below doesn't stall recomputing vector norms.
    pairs = {(w1, w2) for w1 in words.values() for w2 in words.values() if w2 > w1}
    loading_message = "Precomputing word vector distances: "
    status_line.clear()
    status_line.addstr(0, 0, loading_message)
    for (i, (w1, w2)) in enumerate(pairs):
        if i % 1000 == 0:
            status_line.addstr(0, len(loading_message), "{}/{} pairs".format(i, len(pairs)))
            status_line.refresh()
        memoized_distance(w1, w2)

    # Shuffled once so [s]uggest walks the whole vocabulary without repeats.
    suggestion_candidates = list(vectors.keys())
    random.shuffle(suggestion_candidates)

    suggestion = ""
    pos = 0          # cursor index within the sorted word list
    unsaved = False  # True when edits haven't been written back to disk
    while True:
        status_line.clear()
        # BUGFIX: help line previously read "[w]write".
        status_line.addstr(0, 0, "[a]dd/[d]elete/[j]down/[k]up/[/]search/[s]uggest/[w]rite/[q]uit{}".format('*' if unsaved else ''))
        status_line.noutrefresh()

        # Redraw the list; dice numbers are derived from sorted position
        # (base-6 digits, 1-based).
        list_pad.clear()
        for (i, w) in enumerate(sorted(words.values())):
            dice = "{}{}{}{}".format(i // 6**3 + 1, (i // 6**2)% 6 + 1, (i // 6) % 6 + 1, i % 6 + 1)
            entry = dice + " " + w
            list_pad.addstr(i, 0, entry)
        # Keep the cursor roughly centered while clamping to the list bounds.
        scroll_pos = max(0, min(len(words)-(curses.LINES-1), pos - curses.LINES//2))
        list_pad.noutrefresh(scroll_pos,0, 0,0, curses.LINES-2,MAX_WORD_LENGTH-1)

        unknown_cn_words = {w for w in words.values() if vectorize(w) not in vectors}

        # Track the 8 closest (worst-separated) pairs across the whole list.
        worst_distances = []
        for w1 in words.values():
            for w2 in words.values():
                if w1 >= w2:
                    continue
                d = memoized_distance(w1, w2)
                if d is not None and (len(worst_distances) == 0 or d < worst_distances[-1][2]):
                    worst_distances.append((w1, w2, d))
                    worst_distances.sort(key=lambda x: x[2])
                    worst_distances = worst_distances[:8]

        current_word = None if len(words) == 0 else list(sorted(words.values()))[pos]
        # Same, but only for pairs involving the word under the cursor.
        worst_current_distances = []
        for w in words.values():
            if w == current_word:
                continue
            d = memoized_distance(current_word, w)
            if d is not None and (len(worst_current_distances) == 0 or d < worst_current_distances[-1][1]):
                worst_current_distances.append((w, d))
                worst_current_distances.sort(key=lambda x: x[1])
                worst_current_distances = worst_current_distances[:8]

        info_box.clear()
        info_box.addstr(0, 0, """{count}/{target} words;
Worst overall distances:
{worst}
Worst distances from current word:
{worstc}
Unknown (to ConceptNet) words:
 {unk_c}
Suggestion:
 {sug}"""
                        .format(
                            count=len(words),
                            target=TARGET_LIST_LENGTH,
                            worst='\n'.join(' {} to {}, {:.2}'.format(*x) for x in worst_distances),
                            worstc='\n'.join(' {}, {:.2}'.format(*x) for x in worst_current_distances),
                            unk_c='\n '.join(unknown_cn_words if len(unknown_cn_words) <= 3 else list(unknown_cn_words)[:2] + ["..."]),
                            sug=suggestion
                        )
        )
        info_box.noutrefresh()

        # Single physical repaint for all the noutrefresh()ed windows.
        curses.doupdate()

        # Park the cursor on the current word in the visible pad region.
        stdscr.move(pos - scroll_pos, 0)

        ch = stdscr.getch()
        if ch == ord('-') or ch == ord('d'):
            # Delete the word under the cursor.
            if current_word:
                del words[prefix(current_word)]
                pos = min(max(0, len(words)-1), pos)
                unsaved = True
        elif ch == ord('+') or ch == ord('a'):
            # Add a word typed on the status line; confirm before replacing
            # an existing word with the same 3-letter prefix.
            status_line.clear()
            status_line.refresh()
            input_box = curses.textpad.Textbox(status_line)
            input_box.edit()
            word = input_box.gather().strip()
            if len(prefix(word)) >= 3:
                old = words.get(prefix(word), None)
                if old:
                    status_line.clear()
                    status_line.addstr(0,0, "Replace {}? [y/n]".format(old))
                    status_line.refresh()
                if not old or stdscr.getch() == ord('y'):
                    words[prefix(word)] = word
                    pos = sorted(words.values()).index(word)
                    unsaved = True
        elif ch == curses.KEY_DOWN or ch == ord('j'):
            pos = min(max(0, len(words)-1), pos+1)
        elif ch == curses.KEY_UP or ch == ord('k'):
            pos = max(0, pos-1)
        elif ch == ord('/'):
            # Jump to the word whose 3-letter prefix matches the query.
            status_line.clear()
            status_line.refresh()
            input_box = curses.textpad.Textbox(status_line)
            input_box.edit()
            word = input_box.gather()
            word = ''.join(c for c in word.lower() if curses.ascii.islower(c))
            if len(prefix(word)) >= 3 and prefix(word) in words:
                pos = sorted(words.values()).index(words[prefix(word)])
        elif ch == ord('s'):
            # Draw candidates until one has a usable prefix, is not already
            # listed, and sits far enough from every existing word.
            # BUGFIX: guard against exhausting the candidate list, which
            # previously raised IndexError and crashed the editor.
            while suggestion_candidates:
                candidate = suggestion_candidates.pop()
                if len(prefix(candidate)) < 3 or prefix(candidate) in words:
                    continue
                min_dist = None
                for word in words.values():
                    d = memoized_distance(word, candidate)
                    if d is not None and (min_dist is None or d < min_dist):
                        min_dist = d
                if min_dist is None or min_dist > SUGGESTION_DISTANCE_THRESHOLD:
                    suggestion = candidate
                    break
        elif ch == ord('w'):
            # Write the list back out, renumbering every entry.
            with open(filename, "w") as f:
                for (i, w) in enumerate(sorted(words.values())):
                    dice = "{}{}{}{}".format(i // 6**3 + 1, (i // 6**2)% 6 + 1, (i // 6) % 6 + 1, i % 6 + 1)
                    entry = dice + " " + w
                    f.write(entry + "\n")
            unsaved = False
        elif ch == ord('q'):
            # Confirm before discarding unsaved edits.
            if unsaved:
                status_line.clear()
                status_line.addstr(0,0, "Quit with unsaved data? [y/n]")
                status_line.refresh()
            if not unsaved or stdscr.getch() == ord('y'):
                return
|  | 
 | ||||||
|  | 
 | ||||||
if __name__ == "__main__":
    # Validate usage before handing the terminal over to curses.
    if len(sys.argv) != 2:
        print("usage: {} <filename>".format(sys.argv[0]), file=sys.stderr)
        # sys.exit instead of the site-module exit(): always available,
        # including under python -S or when frozen.
        sys.exit(1)

    # curses.wrapper restores the terminal state even if main() raises.
    curses.wrapper(main)
							
								
								
									
										17
									
								
								pickle-vectors.py
									
										
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								pickle-vectors.py
									
										
									
									
									
										Executable file
									
								
							|  | @ -0,0 +1,17 @@ | ||||||
|  | #!/usr/bin/env python3 | ||||||
|  | 
 | ||||||
|  | import gzip | ||||||
|  | import numpy | ||||||
|  | import pickle | ||||||
|  | 
 | ||||||
# Parse the gzipped Numberbatch text dump: the first line is
# "<word count> <dimensions>", and every following line is
# "<word> <float> <float> ...".
vectors = {}
with gzip.open("numberbatch-en-19.08.txt.gz", mode="rt") as f:
    total = int(next(f).split()[0])
    for line_no, raw in enumerate(f):
        if line_no % 1000 == 0:
            # Progress indicator; parsing the full dump takes a while.
            print("{}/{}".format(line_no, total))
        fields = raw.split()
        vectors[fields[0]] = numpy.array([float(x) for x in fields[1:]])

# Persist the parsed vectors so diceware-editor.py can load them quickly.
with open('numberbatch.pkl', 'wb') as f:
    pickle.dump(vectors, f)
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue