Initial commit

This commit is contained in:
xenofem 2020-07-17 04:26:54 -04:00
commit 167d208685
4 changed files with 275 additions and 0 deletions

19
LICENSE Normal file
View file

@ -0,0 +1,19 @@
MIT License Copyright (c) 2020 xenofem <xenofematxenodotscience>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished
to do so, subject to the following conditions:
The above copyright notice and this permission notice (including the next
paragraph) shall be included in all copies or substantial portions of the
Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

25
README Normal file
View file

@ -0,0 +1,25 @@
# diceware
This is the janky python code I used to help me generate the wordlist
I describe at [https://xeno.science/dice](https://xeno.science/dice).
This code is very bad and will probably crash if your terminal window
is too small. Also I'm doing something wrong with ncurses, and after
it loads it'll show you a blank screen until you press a key.
## Dependencies
You'll need Python 3 and NumPy.
## Usage
First, [download the English-language ConceptNet Numberbatch
dataset](https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz),
and then run `pickle-vectors.py` to parse the Numberbatch vectors and
store them in a pickle for later access. This takes a while but
you'll only need to do it once.
Then, run
```bash
diceware-editor.py <word list file>
```

214
diceware-editor.py Executable file
View file

@ -0,0 +1,214 @@
#!/usr/bin/env python3
import curses
import curses.ascii
import curses.textpad
import numpy
import numpy.linalg
import os.path
import pickle
import random
import sys
import textwrap
# Number of entries in a complete word list: one per outcome of four
# six-sided dice (entries are labelled with four digits, each 1-6).
TARGET_LIST_LENGTH = 6**4
# Width in columns of the word-list pane; also used as the x offset of the
# info box and (doubled) as the info box's width.
MAX_WORD_LENGTH = 25
# A suggested word is accepted only when its smallest known distance to any
# word already in the list is greater than this value (or no distance is known).
SUGGESTION_DISTANCE_THRESHOLD = 1.2
def vectorize(word):
    """Return *word* in the key form used for vector lookups.

    Spaces and hyphens are both replaced by underscores; every other
    character is left untouched.
    """
    separators_to_underscore = str.maketrans({' ': '_', '-': '_'})
    return word.translate(separators_to_underscore)
# Maps a term to its vector. Starts empty; main() rebinds it to the object
# unpickled from numberbatch.pkl.
vectors = {}
# Nested cache filled by memoized_distance(): distance_cache[a][b] holds the
# distance between terms a and b, always stored with a <= b.
distance_cache = {}
def memoized_distance(w1, w2):
    """Return the cached vector distance between two words.

    Both words are first converted with vectorize(). If either converted
    term has no entry in the module-level ``vectors`` mapping, None is
    returned. Otherwise the norm of the difference of the two vectors is
    computed once with numpy.linalg.norm, stored in ``distance_cache``,
    and returned on every later call for the same pair (in either order).
    """
    term_a, term_b = vectorize(w1), vectorize(w2)
    if not (term_a in vectors and term_b in vectors):
        return None

    # Store each unordered pair under a single (smaller, larger) ordering so
    # distance(a, b) and distance(b, a) share one cache slot.
    low, high = (term_a, term_b) if term_a <= term_b else (term_b, term_a)
    row = distance_cache.setdefault(low, {})
    if high not in row:
        row[high] = numpy.linalg.norm(vectors[low] - vectors[high])
    return row[high]
def prefix(word):
    """Return the first three ASCII letters of *word*, lowercased.

    Everything that is not an ASCII lowercase letter after lowercasing
    (digits, spaces, punctuation, non-ASCII characters) is dropped first.
    The result is shorter than three characters when *word* contains
    fewer than three such letters.
    """
    letters = [ch for ch in word.lower() if curses.ascii.islower(ch)]
    return ''.join(letters[:3])
def _dice_label(index):
    """Return the four-digit dice label (each digit 1-6) for list position *index*."""
    return "{}{}{}{}".format(
        index // 6**3 + 1,
        (index // 6**2) % 6 + 1,
        (index // 6) % 6 + 1,
        index % 6 + 1,
    )


def _closest(candidates, limit=8):
    """Return up to *limit* tuples from *candidates* with the smallest distance.

    Each candidate is a tuple whose last item is a distance, or None when the
    distance is unknown; candidates with None are ignored. The result is
    ordered by ascending distance.
    """
    import heapq  # local import so this block does not depend on the file header

    known = (c for c in candidates if c[-1] is not None)
    return heapq.nsmallest(limit, known, key=lambda c: c[-1])


def main(stdscr):
    """Run the interactive word-list editor inside a curses session.

    Loads the pickled vectors from ``numberbatch.pkl`` (relative to the
    current working directory) into the module-level ``vectors``, reads the
    word list named by ``sys.argv[1]`` if that file exists, and then loops
    on single-key commands (add, delete, move, search, suggest, write, quit)
    until the user quits.

    Words are keyed by prefix(word), so at most one word per three-letter
    prefix is kept; words with fewer than three letters are ignored.

    Args:
        stdscr: the standard screen window supplied by curses.wrapper().
    """
    global vectors
    stdscr.clear()
    # Push the cleared stdscr to the terminal right away. stdscr.getch()
    # implicitly refreshes stdscr; if the clear were still pending at that
    # point it would wipe the other windows and leave a blank screen until
    # the next keypress.
    stdscr.refresh()
    stdscr.leaveok(False)
    filename = sys.argv[1]
    list_pad = curses.newpad(TARGET_LIST_LENGTH*2, MAX_WORD_LENGTH)
    status_line = curses.newwin(1, curses.COLS, curses.LINES - 1, 0)
    info_box = curses.newwin(curses.LINES - 1, MAX_WORD_LENGTH*2, 0, MAX_WORD_LENGTH)

    loading_message = "Loading conceptnet numberbatch vectors..."
    status_line.addstr(0, 0, loading_message)
    # Without this refresh the message is never drawn before the slow load.
    status_line.refresh()
    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file; only load a numberbatch.pkl that you generated yourself.
    with open('numberbatch.pkl', 'rb') as f:
        vectors = pickle.load(f)
    status_line.clear()
    status_line.addstr(0, 0, "Vectors loaded!")
    status_line.refresh()

    # prefix -> word; a later line with the same prefix replaces an earlier one.
    words = {}
    if os.path.isfile(filename):
        with open(filename) as f:
            for line in f:
                fields = line.strip().split(maxsplit=1)
                if not fields:
                    # Blank line: nothing to parse.
                    continue
                # Lines look like "<dice> <word>"; a line holding a single
                # token is taken as the word itself.
                word = fields[-1]
                if len(prefix(word)) < 3:
                    continue
                words[prefix(word)] = word

    # Warm the distance cache so the first screen redraw is not slow.
    pairs = {(w1, w2) for w1 in words.values() for w2 in words.values() if w2 > w1}
    loading_message = "Precomputing word vector distances: "
    status_line.clear()
    status_line.addstr(0, 0, loading_message)
    for (i, (w1, w2)) in enumerate(pairs):
        if i % 1000 == 0:
            status_line.addstr(0, len(loading_message), "{}/{} pairs".format(i, len(pairs)))
            status_line.refresh()
        memoized_distance(w1, w2)

    # Suggestions are drawn from the vector vocabulary in random order.
    suggestion_candidates = list(vectors.keys())
    random.shuffle(suggestion_candidates)

    suggestion = ""
    pos = 0
    unsaved = False

    while True:
        status_line.clear()
        status_line.addstr(0, 0, "[a]dd/[d]elete/[j]down/[k]up/[/]search/[s]uggest/[w]write/[q]uit{}".format('*' if unsaved else ''))
        status_line.noutrefresh()

        ordered = sorted(words.values())

        list_pad.clear()
        for (i, w) in enumerate(ordered):
            list_pad.addstr(i, 0, _dice_label(i) + " " + w)
        # Keep the selected row roughly centred without scrolling past the end.
        scroll_pos = max(0, min(len(words)-(curses.LINES-1), pos - curses.LINES//2))
        list_pad.noutrefresh(scroll_pos,0, 0,0, curses.LINES-2,MAX_WORD_LENGTH-1)

        unknown_cn_words = {w for w in words.values() if vectorize(w) not in vectors}

        # The closest pairs overall are the "worst" ones for the list.
        worst_distances = _closest(
            (w1, w2, memoized_distance(w1, w2))
            for w1 in words.values()
            for w2 in words.values()
            if w1 < w2
        )

        current_word = ordered[pos] if ordered else None
        # When the list is empty the generator below yields nothing, so
        # memoized_distance is never called with current_word=None.
        worst_current_distances = _closest(
            (w, memoized_distance(current_word, w))
            for w in words.values()
            if w != current_word
        )

        report = '\n'.join([
            "{}/{} words;".format(len(words), TARGET_LIST_LENGTH),
            "Worst overall distances:",
            '\n'.join(' {} to {}, {:.2}'.format(*x) for x in worst_distances),
            "Worst distances from current word:",
            '\n'.join(' {}, {:.2}'.format(*x) for x in worst_current_distances),
            "Unknown (to ConceptNet) words:",
            '\n '.join(unknown_cn_words if len(unknown_cn_words) <= 3 else list(unknown_cn_words)[:2] + ["..."]),
            "Suggestion:",
            suggestion,
        ])
        info_box.clear()
        try:
            info_box.addstr(0, 0, report)
        except curses.error:
            # The report is taller or wider than the info box (small
            # terminal). Whatever fitted has already been written, so show
            # the truncated report instead of crashing.
            pass
        info_box.noutrefresh()

        curses.doupdate()

        stdscr.move(pos - scroll_pos, 0)
        ch = stdscr.getch()

        if ch in (ord('-'), ord('d')):
            if current_word:
                del words[prefix(current_word)]
                pos = min(max(0, len(words)-1), pos)
                unsaved = True
        elif ch in (ord('+'), ord('a')):
            status_line.clear()
            status_line.refresh()
            input_box = curses.textpad.Textbox(status_line)
            input_box.edit()
            word = input_box.gather().strip()
            if len(prefix(word)) >= 3:
                old = words.get(prefix(word), None)
                if old:
                    status_line.clear()
                    status_line.addstr(0,0, "Replace {}? [y/n]".format(old))
                    status_line.refresh()
                # Only ask for a keypress when there is something to replace.
                if not old or stdscr.getch() == ord('y'):
                    words[prefix(word)] = word
                    pos = sorted(words.values()).index(word)
                    unsaved = True
        elif ch in (curses.KEY_DOWN, ord('j')):
            pos = min(max(0, len(words)-1), pos+1)
        elif ch in (curses.KEY_UP, ord('k')):
            pos = max(0, pos-1)
        elif ch == ord('/'):
            status_line.clear()
            status_line.refresh()
            input_box = curses.textpad.Textbox(status_line)
            input_box.edit()
            word = input_box.gather()
            word = ''.join(c for c in word.lower() if curses.ascii.islower(c))
            if len(prefix(word)) >= 3 and prefix(word) in words:
                pos = sorted(words.values()).index(words[prefix(word)])
        elif ch == ord('s'):
            while suggestion_candidates:
                candidate = suggestion_candidates.pop()
                if len(prefix(candidate)) < 3 or prefix(candidate) in words:
                    continue
                min_dist = None
                for word in words.values():
                    d = memoized_distance(word, candidate)
                    if d is not None and (min_dist is None or d < min_dist):
                        min_dist = d
                if min_dist is None or min_dist > SUGGESTION_DISTANCE_THRESHOLD:
                    suggestion = candidate
                    break
            else:
                # Every candidate has been used up or rejected; report that
                # instead of crashing on pop() from an empty list.
                suggestion = "(no candidates left)"
        elif ch == ord('w'):
            with open(filename, "w") as f:
                for (i, w) in enumerate(sorted(words.values())):
                    f.write(_dice_label(i) + " " + w + "\n")
            unsaved = False
        elif ch == ord('q'):
            if unsaved:
                status_line.clear()
                status_line.addstr(0,0, "Quit with unsaved data? [y/n]")
                status_line.refresh()
            if not unsaved or stdscr.getch() == ord('y'):
                return
if __name__ == "__main__":
    # Exactly one argument is required: the word list file to edit.
    if len(sys.argv) != 2:
        print("usage: {} <filename>".format(sys.argv[0]), file=sys.stderr)
        # sys.exit rather than the bare exit() helper, which is provided by
        # the site module for interactive use and is not always available.
        sys.exit(1)
    # curses.wrapper initialises the terminal, calls main(stdscr), and
    # restores the terminal afterwards even if main raises.
    curses.wrapper(main)

17
pickle-vectors.py Executable file
View file

@ -0,0 +1,17 @@
#!/usr/bin/env python3
import gzip
import numpy
import pickle
# Parse the gzipped Numberbatch text file in the current directory into a
# dict mapping each term to a numpy array, then pickle that dict.
vectors = {}
# Decode as UTF-8 explicitly so the result does not depend on the locale's
# default encoding.
with gzip.open("numberbatch-en-19.08.txt.gz", mode="rt", encoding="utf-8") as f:
    # The first field of the header line is read as the entry count and is
    # used only for the progress display.
    count = int(next(f).split()[0])
    for (i, line) in enumerate(f):
        if i % 1000 == 0:
            print("{}/{}".format(i, count))
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        # Each data line is "<term> <float> <float> ...".
        [word, *vec] = fields
        vectors[word] = numpy.array([float(x) for x in vec])
with open('numberbatch.pkl', 'wb') as f:
    pickle.dump(vectors, f)