From 167d2086858ab269311d20098bce95662e35c2ff Mon Sep 17 00:00:00 2001 From: xenofem Date: Fri, 17 Jul 2020 04:26:54 -0400 Subject: [PATCH] Initial commit --- LICENSE | 19 ++++ README | 25 ++++++ diceware-editor.py | 214 +++++++++++++++++++++++++++++++++++++++++++++ pickle-vectors.py | 17 ++++ 4 files changed, 275 insertions(+) create mode 100644 LICENSE create mode 100644 README create mode 100755 diceware-editor.py create mode 100755 pickle-vectors.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..3c42862 --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +MIT License Copyright (c) 2020 xenofem + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished +to do so, subject to the following conditions: + +The above copyright notice and this permission notice (including the next +paragraph) shall be included in all copies or substantial portions of the +Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS +OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF +OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README b/README new file mode 100644 index 0000000..c1a9f12 --- /dev/null +++ b/README @@ -0,0 +1,25 @@ +# diceware + +This is the janky python code I used to help me generate the wordlist +I describe at [https://xeno.science/dice](https://xeno.science/dice). 
+This code is very bad and will probably crash if your terminal window
+is too small. Also I'm doing something wrong with ncurses, and after
+it loads it'll show you a blank screen until you press a key.
+
+## Dependencies
+
+You'll need Python 3 and NumPy.
+
+## Usage
+
+First, [download the English-language ConceptNet Numberbatch
+dataset](https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz),
+and then run `pickle-vectors.py` to parse the Numberbatch vectors and
+store them in a pickle for later access. This takes a while but
+you'll only need to do it once.
+
+Then, run
+
+```bash
+diceware-editor.py [word list file]
+```
\ No newline at end of file
diff --git a/diceware-editor.py b/diceware-editor.py
new file mode 100755
index 0000000..46a6bad
--- /dev/null
+++ b/diceware-editor.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python3
+
+# Interactive curses editor for building a 6^4-entry diceware word list.
+# Each word is keyed by its first three ASCII letters, and words are
+# scored for mutual semantic similarity using ConceptNet Numberbatch
+# word vectors (pre-pickled by pickle-vectors.py into numberbatch.pkl).
+
+import curses
+import curses.ascii
+import curses.textpad
+import numpy
+import numpy.linalg
+import os.path
+import pickle
+import random
+import sys
+import textwrap  # NOTE(review): imported but apparently never used in this file
+
+# Final list size: four dice rolls, six faces each.
+TARGET_LIST_LENGTH = 6**4
+# Column width reserved for one "dddd word" entry in the list pad.
+MAX_WORD_LENGTH = 25
+# A suggested word must be at least this far (Euclidean distance in
+# Numberbatch vector space) from every word already on the list.
+SUGGESTION_DISTANCE_THRESHOLD = 1.2
+
+def vectorize(word):
+    """Normalize a word to the underscore-joined form used as a key
+    into the Numberbatch vector table."""
+    return word.replace(' ', '_').replace('-', '_')
+
+# word-key -> numpy vector; populated in main() from numberbatch.pkl.
+vectors = {}
+# Nested cache {lesser_key: {greater_key: distance}} for memoized_distance().
+distance_cache = {}
+def memoized_distance(w1, w2):
+    """Return the Euclidean distance between the two words' Numberbatch
+    vectors, or None if either word has no vector.  Results are cached
+    under the lexicographically ordered key pair, so the distance for
+    (a, b) and (b, a) is computed and stored only once."""
+    w1 = vectorize(w1)
+    w2 = vectorize(w2)
+    if w1 not in vectors or w2 not in vectors:
+        return None
+    # Canonical ordering so the cache is symmetric.
+    v1 = min(w1, w2)
+    v2 = max(w1, w2)
+    if v1 not in distance_cache:
+        distance_cache[v1] = {}
+    if v2 not in distance_cache[v1]:
+        distance_cache[v1][v2] = numpy.linalg.norm(vectors[v1] - vectors[v2])
+
+    return distance_cache[v1][v2]
+
+def prefix(word):
+    """First three ASCII letters of the lowercased word.  This is the
+    dictionary key for the word list, so every entry must be unique in
+    its first three letters."""
+    return ''.join(c for c in word.lower() if curses.ascii.islower(c))[:3]
+
+def main(stdscr):
+    global vectors
+    stdscr.clear()
+    stdscr.leaveok(False)
+
+    filename = sys.argv[1]
+
+    # Pad for the scrollable word list (left column), a one-line status
+    # bar at the bottom, and an info panel to the right of the list.
+    list_pad = curses.newpad(TARGET_LIST_LENGTH*2, MAX_WORD_LENGTH)
+    status_line = curses.newwin(1, curses.COLS, curses.LINES - 1, 0)
+    info_box = curses.newwin(curses.LINES - 1, MAX_WORD_LENGTH*2, 0, MAX_WORD_LENGTH)
+
+    loading_message = "Loading conceptnet numberbatch vectors..."
+    status_line.addstr(0, 0, loading_message)
+    # NOTE(review): no refresh() is called before the slow pickle load
+    # below, so this message is likely never displayed — plausibly the
+    # "blank screen until you press a key" symptom the README mentions;
+    # confirm before relying on it.
+    with open('numberbatch.pkl', 'rb') as f:
+        vectors = pickle.load(f)
+    status_line.clear()
+    status_line.addstr(0, 0, "Vectors loaded!")
+    status_line.refresh()
+
+    # Load the existing word list, if any.  Each line is expected to be
+    # "<dice digits> <word>"; split(maxsplit=1)[-1] keeps the word part.
+    words = {}
+    if os.path.isfile(filename):
+        with open(filename) as f:
+            for line in f:
+                word = line.strip().split(maxsplit=1)[-1]
+                if len(prefix(word)) < 3:
+                    continue
+                words[prefix(word)] = word
+
+    # Warm the distance cache for every unordered pair up front so the
+    # per-keystroke scans in the UI loop below are cache hits.
+    pairs = {(w1, w2) for w1 in words.values() for w2 in words.values() if w2 > w1}
+    loading_message = "Precomputing word vector distances: "
+    status_line.clear()
+    status_line.addstr(0, 0, loading_message)
+    for (i, (w1, w2)) in enumerate(pairs):
+        if i % 1000 == 0:
+            status_line.addstr(0, len(loading_message), "{}/{} pairs".format(i, len(pairs)))
+            status_line.refresh()
+        memoized_distance(w1, w2)
+
+    # Random order to draw suggestion candidates from (popped from the end).
+    suggestion_candidates = list(vectors.keys())
+    random.shuffle(suggestion_candidates)
+
+    suggestion = ""
+    pos = 0           # cursor index into the sorted word list
+    unsaved = False   # True when words differ from the file on disk
+    while True:
+        # --- redraw status bar, word list, and info panel -------------
+        status_line.clear()
+        status_line.addstr(0, 0, "[a]dd/[d]elete/[j]down/[k]up/[/]search/[s]uggest/[w]write/[q]uit{}".format('*' if unsaved else ''))
+        status_line.noutrefresh()
+
+        list_pad.clear()
+        for (i, w) in enumerate(sorted(words.values())):
+            # Encode the list index as four base-6 dice digits (1-6).
+            dice = "{}{}{}{}".format(i // 6**3 + 1, (i // 6**2)% 6 + 1, (i // 6) % 6 + 1, i % 6 + 1)
+            entry = dice + " " + w
+            list_pad.addstr(i, 0, entry)
+        # Keep the cursor roughly centered while clamping to list bounds.
+        scroll_pos = max(0, min(len(words)-(curses.LINES-1), pos - curses.LINES//2))
+        list_pad.noutrefresh(scroll_pos,0, 0,0, curses.LINES-2,MAX_WORD_LENGTH-1)
+
+        unknown_cn_words = {w for w in words.values() if vectorize(w) not in vectors}
+
+        # "Worst" = most semantically similar, i.e. smallest distance.
+        # NOTE(review): the append guard compares against the most
+        # recently appended distance rather than the current 8th-smallest,
+        # so it only roughly prunes; correctness relies on the sort +
+        # slice below.  Same pattern in the per-word scan that follows.
+        worst_distances = []
+        for w1 in words.values():
+            for w2 in words.values():
+                if w1 >= w2:
+                    continue
+                d = memoized_distance(w1, w2)
+                if d is not None and (len(worst_distances) == 0 or d < worst_distances[-1][2]):
+                    worst_distances.append((w1, w2, d))
+        worst_distances.sort(key=lambda x: x[2])
+        worst_distances = worst_distances[:8]
+
+        # Closest neighbours of the word under the cursor.
+        current_word = None if len(words) == 0 else list(sorted(words.values()))[pos]
+        worst_current_distances = []
+        for w in words.values():
+            if w == current_word:
+                continue
+            d = memoized_distance(current_word, w)
+            if d is not None and (len(worst_current_distances) == 0 or d < worst_current_distances[-1][1]):
+                worst_current_distances.append((w, d))
+        worst_current_distances.sort(key=lambda x: x[1])
+        worst_current_distances = worst_current_distances[:8]
+
+        info_box.clear()
+        info_box.addstr(0, 0, """{count}/{target} words;
+Worst overall distances:
+{worst}
+Worst distances from current word:
+{worstc}
+Unknown (to ConceptNet) words:
+ {unk_c}
+Suggestion:
+ {sug}"""
+                        .format(
+                            count=len(words),
+                            target=TARGET_LIST_LENGTH,
+                            worst='\n'.join(' {} to {}, {:.2}'.format(*x) for x in worst_distances),
+                            worstc='\n'.join(' {}, {:.2}'.format(*x) for x in worst_current_distances),
+                            unk_c='\n '.join(unknown_cn_words if len(unknown_cn_words) <= 3 else list(unknown_cn_words)[:2] + ["..."]),
+                            sug=suggestion
+                        )
+        )
+        info_box.noutrefresh()
+
+        curses.doupdate()
+
+        stdscr.move(pos - scroll_pos, 0)
+
+        # --- handle one keystroke -------------------------------------
+        ch = stdscr.getch()
+        if ch == ord('-') or ch == ord('d'):
+            # Delete the word under the cursor.
+            if current_word:
+                del words[prefix(current_word)]
+                pos = min(max(0, len(words)-1), pos)
+                unsaved = True
+        elif ch == ord('+') or ch == ord('a'):
+            # Prompt for a new word on the status line; confirm before
+            # replacing an existing word with the same 3-letter prefix.
+            status_line.clear()
+            status_line.refresh()
+            input_box = curses.textpad.Textbox(status_line)
+            input_box.edit()
+            word = input_box.gather().strip()
+            if len(prefix(word)) >= 3:
+                old = words.get(prefix(word), None)
+                if old:
+                    status_line.clear()
+                    status_line.addstr(0,0, "Replace {}? [y/n]".format(old))
+                    status_line.refresh()
+                if not old or stdscr.getch() == ord('y'):
+                    words[prefix(word)] = word
+                    pos = sorted(words.values()).index(word)
+                    unsaved = True
+        elif ch == curses.KEY_DOWN or ch == ord('j'):
+            pos = min(max(0, len(words)-1), pos+1)
+        elif ch == curses.KEY_UP or ch == ord('k'):
+            pos = max(0, pos-1)
+        elif ch == ord('/'):
+            # Jump to the list entry matching the typed prefix, if any.
+            status_line.clear()
+            status_line.refresh()
+            input_box = curses.textpad.Textbox(status_line)
+            input_box.edit()
+            word = input_box.gather()
+            word = ''.join(c for c in word.lower() if curses.ascii.islower(c))
+            if len(prefix(word)) >= 3 and prefix(word) in words:
+                pos = sorted(words.values()).index(words[prefix(word)])
+        elif ch == ord('s'):
+            # Pop random candidates until one is found whose prefix is
+            # unused and whose nearest listed word is beyond the
+            # similarity threshold (or whose distances are all unknown).
+            while True:
+                candidate = suggestion_candidates.pop()
+                if len(prefix(candidate)) < 3 or prefix(candidate) in words:
+                    continue
+                min_dist = None
+                for word in words.values():
+                    d = memoized_distance(word, candidate)
+                    if d is not None and (min_dist is None or d < min_dist):
+                        min_dist = d
+                if min_dist is None or min_dist > SUGGESTION_DISTANCE_THRESHOLD:
+                    suggestion = candidate
+                    break
+        elif ch == ord('w'):
+            # Write the list back to disk in "dddd word" format.
+            with open(filename, "w") as f:
+                for (i, w) in enumerate(sorted(words.values())):
+                    dice = "{}{}{}{}".format(i // 6**3 + 1, (i // 6**2)% 6 + 1, (i // 6) % 6 + 1, i % 6 + 1)
+                    entry = dice + " " + w
+                    f.write(entry + "\n")
+            unsaved = False
+        elif ch == ord('q'):
+            if unsaved:
+                status_line.clear()
+                status_line.addstr(0,0, "Quit with unsaved data? [y/n]")
+                status_line.refresh()
+            if not unsaved or stdscr.getch() == ord('y'):
+                return
+
+
+if len(sys.argv) != 2:
+    # NOTE(review): the usage string looks truncated ("usage: {} " with a
+    # trailing space) — the argument placeholder was probably lost in
+    # transit; the README says the argument is a word list file.  Confirm
+    # against the original source.
+    print("usage: {} ".format(sys.argv[0]), file=sys.stderr)
+    exit(1)
+
+curses.wrapper(main)
diff --git a/pickle-vectors.py b/pickle-vectors.py
new file mode 100755
index 0000000..2ea1658
--- /dev/null
+++ b/pickle-vectors.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+# Parse the gzipped ConceptNet Numberbatch text dump into a dict of
+# {word: numpy vector} and pickle it for fast loading by diceware-editor.py.
+
+import gzip
+import numpy
+import pickle
+
+vectors = {}
+with gzip.open("numberbatch-en-19.08.txt.gz", mode="rt") as f:
+    # The first line is a header whose first field is the number of rows;
+    # used only for the progress display below.
+    count = int(next(f).split()[0])
+    for (i, line) in enumerate(f):
+        if i % 1000 == 0:
+            print("{}/{}".format(i, count))
+        # Each data line is "<word> <float> <float> ...".
+        [word, *vec] = line.split()
+        vectors[word] = numpy.array([float(x) for x in vec])
+
+with open('numberbatch.pkl', 'wb') as f:
+    pickle.dump(vectors, f)