Initial commit
This commit is contained in:
		
						commit
						167d208685
					
				
					 4 changed files with 275 additions and 0 deletions
				
			
		
							
								
								
									
										19
									
								
								LICENSE
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								LICENSE
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,19 @@
 | 
				
			||||||
 | 
					MIT License Copyright (c) 2020 xenofem <xenofematxenodotscience>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
				
			||||||
 | 
					of this software and associated documentation files (the "Software"), to deal
 | 
				
			||||||
 | 
					in the Software without restriction, including without limitation the rights
 | 
				
			||||||
 | 
					to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
				
			||||||
 | 
					copies of the Software, and to permit persons to whom the Software is furnished
 | 
				
			||||||
 | 
					to do so, subject to the following conditions:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The above copyright notice and this permission notice (including the next
 | 
				
			||||||
 | 
					paragraph) shall be included in all copies or substantial portions of the
 | 
				
			||||||
 | 
					Software.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
				
			||||||
 | 
					IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 | 
				
			||||||
 | 
					FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
 | 
				
			||||||
 | 
					OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 | 
				
			||||||
 | 
					WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
 | 
				
			||||||
 | 
					OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
				
			||||||
							
								
								
									
										25
									
								
								README
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								README
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,25 @@
 | 
				
			||||||
 | 
					# diceware
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					This is the janky Python code I used to help me generate the wordlist
 | 
				
			||||||
 | 
					I describe at [https://xeno.science/dice](https://xeno.science/dice).
 | 
				
			||||||
 | 
					This code is very bad and will probably crash if your terminal window
 | 
				
			||||||
 | 
					is too small.  Also I'm doing something wrong with ncurses, and after
 | 
				
			||||||
 | 
					it loads it'll show you a blank screen until you press a key.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Dependencies
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					You'll need Python 3 and NumPy.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Usage
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					First, [download the English-language ConceptNet Numberbatch
 | 
				
			||||||
 | 
					dataset](https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz),
 | 
				
			||||||
 | 
					and then run `pickle-vectors.py` to parse the Numberbatch vectors and
 | 
				
			||||||
 | 
					store them in a pickle for later access.  This takes a while but
 | 
				
			||||||
 | 
					you'll only need to do it once.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Then, run
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```bash
 | 
				
			||||||
 | 
					diceware-editor.py <word list file>
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
							
								
								
									
										214
									
								
								diceware-editor.py
									
										
									
									
									
										Executable file
									
								
							
							
						
						
									
										214
									
								
								diceware-editor.py
									
										
									
									
									
										Executable file
									
								
							| 
						 | 
					@ -0,0 +1,214 @@
 | 
				
			||||||
 | 
					#!/usr/bin/env python3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import curses
 | 
				
			||||||
 | 
					import curses.ascii
 | 
				
			||||||
 | 
					import curses.textpad
 | 
				
			||||||
 | 
					import numpy
 | 
				
			||||||
 | 
					import numpy.linalg
 | 
				
			||||||
 | 
					import os.path
 | 
				
			||||||
 | 
					import pickle
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					import sys
 | 
				
			||||||
 | 
					import textwrap
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Target number of words in the list: one entry per outcome of four
					# six-sided dice (the editor labels entries with four digits, each 1-6).
					TARGET_LIST_LENGTH = 6**4
 | 
				
			||||||
 | 
					# Width, in columns, of the word-list pad.  The info box starts at this
					# column and is twice this wide.
					MAX_WORD_LENGTH = 25
 | 
				
			||||||
 | 
					# A candidate is only suggested when its smallest known vector distance
					# to the words already in the list is greater than this value.
					SUGGESTION_DISTANCE_THRESHOLD = 1.2
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def vectorize(word):
					    """Return ``word`` with every space and hyphen turned into an underscore.

					    The result is the form used as a key when looking the word up in
					    ``vectors``.
					    """
					    separators_to_underscore = str.maketrans(' -', '__')
					    return word.translate(separators_to_underscore)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Word vectors keyed by vectorize()d word; filled in by main().
					vectors = {}
					# Two-level cache: distance_cache[smaller_key][larger_key] -> distance.
					distance_cache = {}


					def memoized_distance(w1, w2):
					    """Return the cached vector distance between two words.

					    Both words are passed through ``vectorize`` first.  The distance is the
					    ``numpy.linalg.norm`` of the difference of their vectors; it is computed
					    once per unordered pair and stored in ``distance_cache``.

					    Returns ``None`` when either word has no entry in ``vectors``.
					    """
					    first = vectorize(w1)
					    second = vectorize(w2)
					    if not (first in vectors and second in vectors):
					        return None

					    # Order the pair so (a, b) and (b, a) share a single cache slot.
					    if second < first:
					        first, second = second, first

					    row = distance_cache.setdefault(first, {})
					    try:
					        return row[second]
					    except KeyError:
					        row[second] = numpy.linalg.norm(vectors[first] - vectors[second])
					        return row[second]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def prefix(word):
					    """Return the first three ASCII letters of ``word``, lowercased.

					    Anything that is not ``a``-``z`` after lowercasing is dropped before the
					    first three characters are taken, so the result may be shorter than
					    three characters.
					    """
					    letters = [ch for ch in word.lower() if curses.ascii.islower(ch)]
					    return ''.join(letters[:3])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _dice_label(index):
					    """Return the four-digit dice label (digits 1-6) for list position ``index``."""
					    return "{}{}{}{}".format(
					        index // 6**3 + 1,
					        (index // 6**2) % 6 + 1,
					        (index // 6) % 6 + 1,
					        index % 6 + 1,
					    )


					def _keep_closest(entries, entry, limit=8):
					    """Record ``entry`` in ``entries`` if it is among the ``limit`` smallest.

					    ``entries`` is a list of tuples whose last element is a distance; it is
					    kept sorted by ascending distance and is modified in place.
					    """
					    # Always accept while the list is not full; the original only accepted
					    # entries smaller than the current last one, which left the list short.
					    if len(entries) < limit or entry[-1] < entries[-1][-1]:
					        entries.append(entry)
					        entries.sort(key=lambda item: item[-1])
					        del entries[limit:]


					def _show_status(status_line, text):
					    """Replace the contents of ``status_line`` with ``text`` (not refreshed).

					    The text is clipped to one column less than the window width, because
					    writing into the last cell of a window raises ``curses.error``.
					    """
					    status_line.clear()
					    width = status_line.getmaxyx()[1]
					    status_line.addnstr(0, 0, text, max(0, width - 1))


					def _read_line(status_line):
					    """Clear ``status_line``, let the user edit text in it, and return the text."""
					    status_line.clear()
					    status_line.refresh()
					    input_box = curses.textpad.Textbox(status_line)
					    input_box.edit()
					    return input_box.gather()


					def _next_suggestion(candidates, words):
					    """Pop entries from ``candidates`` until one qualifies as a suggestion.

					    A candidate qualifies when it has a three-letter prefix not already used
					    in ``words`` and every known distance to the words in the list is
					    greater than ``SUGGESTION_DISTANCE_THRESHOLD``.

					    Returns the candidate, or ``None`` when ``candidates`` runs out.
					    """
					    while candidates:
					        candidate = candidates.pop()
					        key = prefix(candidate)
					        if len(key) < 3 or key in words:
					            continue
					        distances = (memoized_distance(word, candidate) for word in words.values())
					        if all(d is None or d > SUGGESTION_DISTANCE_THRESHOLD for d in distances):
					            return candidate
					    return None


					def main(stdscr):
					    """Run the interactive word-list editor on the curses screen ``stdscr``.

					    Loads the pickled vectors from ``numberbatch.pkl`` into the module-level
					    ``vectors``, reads the word list named by ``sys.argv[1]`` if that file
					    exists, and then loops: redraw the word list, the info box and the
					    status line, read one key, act on it.

					    Keys: ``a``/``+`` add a word, ``d``/``-`` delete the current word,
					    ``j``/Down and ``k``/Up move, ``/`` jump to a word by prefix, ``s`` pick
					    a suggestion, ``w`` write the list to the file, ``q`` quit.
					    """
					    global vectors
					    stdscr.clear()
					    # getch() refreshes its window first.  Flush the cleared stdscr now so
					    # that the first stdscr.getch() below does not paint a blank screen over
					    # the windows that have been drawn in the meantime.
					    stdscr.refresh()
					    stdscr.leaveok(False)

					    filename = sys.argv[1]

					    list_pad = curses.newpad(TARGET_LIST_LENGTH*2, MAX_WORD_LENGTH)
					    status_line = curses.newwin(1, curses.COLS, curses.LINES - 1, 0)
					    info_box = curses.newwin(curses.LINES - 1, MAX_WORD_LENGTH*2, 0, MAX_WORD_LENGTH)

					    _show_status(status_line, "Loading conceptnet numberbatch vectors...")
					    # Refresh before the slow load, otherwise the message is never shown.
					    status_line.refresh()
					    # NOTE(security): pickle.load can execute arbitrary code from the file;
					    # only load a numberbatch.pkl that you generated yourself.
					    with open('numberbatch.pkl', 'rb') as f:
					        vectors = pickle.load(f)
					    _show_status(status_line, "Vectors loaded!")
					    status_line.refresh()

					    # Maps three-letter prefix -> word, so each prefix appears at most once.
					    words = {}
					    if os.path.isfile(filename):
					        with open(filename) as f:
					            for line in f:
					                fields = line.strip().split(maxsplit=1)
					                if not fields:
					                    # Blank line: nothing to take the last field of.
					                    continue
					                word = fields[-1]
					                key = prefix(word)
					                if len(key) < 3:
					                    continue
					                words[key] = word

					    pairs = {(w1, w2) for w1 in words.values() for w2 in words.values() if w2 > w1}
					    loading_message = "Precomputing word vector distances: "
					    for (i, (w1, w2)) in enumerate(pairs):
					        if i % 1000 == 0:
					            _show_status(status_line, loading_message + "{}/{} pairs".format(i, len(pairs)))
					            status_line.refresh()
					        memoized_distance(w1, w2)

					    suggestion_candidates = list(vectors)
					    random.shuffle(suggestion_candidates)

					    suggestion = ""
					    pos = 0
					    unsaved = False
					    while True:
					        _show_status(
					            status_line,
					            "[a]dd/[d]elete/[j]down/[k]up/[/]search/[s]uggest/[w]write/[q]uit{}".format('*' if unsaved else ''),
					        )
					        status_line.noutrefresh()

					        ordered = sorted(words.values())

					        list_pad.clear()
					        for (i, w) in enumerate(ordered):
					            # Clip to the pad width so an over-long word cannot spill into
					            # the next row.
					            list_pad.addnstr(i, 0, _dice_label(i) + " " + w, MAX_WORD_LENGTH - 1)
					        scroll_pos = max(0, min(len(words)-(curses.LINES-1), pos - curses.LINES//2))
					        list_pad.noutrefresh(scroll_pos,0, 0,0, curses.LINES-2,MAX_WORD_LENGTH-1)

					        unknown_cn_words = {w for w in words.values() if vectorize(w) not in vectors}
					        unknown_preview = list(unknown_cn_words)
					        if len(unknown_preview) > 3:
					            unknown_preview = unknown_preview[:2] + ["..."]

					        # The eight closest pairs in the whole list.  ``ordered`` is sorted
					        # and holds distinct words, so each unordered pair is visited once.
					        worst_distances = []
					        for (i, w1) in enumerate(ordered):
					            for w2 in ordered[i + 1:]:
					                d = memoized_distance(w1, w2)
					                if d is not None:
					                    _keep_closest(worst_distances, (w1, w2, d))

					        # The eight words closest to the word under the cursor.
					        current_word = ordered[pos] if ordered else None
					        worst_current_distances = []
					        for w in words.values():
					            if w == current_word:
					                continue
					            d = memoized_distance(current_word, w)
					            if d is not None:
					                _keep_closest(worst_current_distances, (w, d))

					        info_text = "\n".join([
					            "{}/{} words;".format(len(words), TARGET_LIST_LENGTH),
					            "Worst overall distances:",
					            "\n".join(' {} to {}, {:.2}'.format(*x) for x in worst_distances),
					            "Worst distances from current word:",
					            "\n".join(' {}, {:.2}'.format(*x) for x in worst_current_distances),
					            "Unknown (to ConceptNet) words:",
					            " " + "\n ".join(unknown_preview),
					            "Suggestion:",
					            " " + suggestion,
					        ])
					        info_box.clear()
					        try:
					            info_box.addstr(0, 0, info_text)
					        except curses.error:
					            # The text did not fit in the window; keep whatever was drawn
					            # instead of crashing the editor.
					            pass
					        info_box.noutrefresh()

					        curses.doupdate()

					        stdscr.move(pos - scroll_pos, 0)

					        ch = stdscr.getch()
					        if ch in (ord('-'), ord('d')):
					            if current_word:
					                del words[prefix(current_word)]
					                pos = min(max(0, len(words)-1), pos)
					                unsaved = True
					        elif ch in (ord('+'), ord('a')):
					            word = _read_line(status_line).strip()
					            key = prefix(word)
					            if len(key) >= 3:
					                old = words.get(key)
					                if old:
					                    _show_status(status_line, "Replace {}? [y/n]".format(old))
					                    status_line.refresh()
					                if not old or stdscr.getch() == ord('y'):
					                    words[key] = word
					                    pos = sorted(words.values()).index(word)
					                    unsaved = True
					        elif ch in (curses.KEY_DOWN, ord('j')):
					            pos = min(max(0, len(words)-1), pos+1)
					        elif ch in (curses.KEY_UP, ord('k')):
					            pos = max(0, pos-1)
					        elif ch == ord('/'):
					            key = prefix(_read_line(status_line))
					            if len(key) >= 3 and key in words:
					                pos = sorted(words.values()).index(words[key])
					        elif ch == ord('s'):
					            found = _next_suggestion(suggestion_candidates, words)
					            # Running out of candidates used to raise IndexError from pop().
					            suggestion = found if found is not None else "(no more candidates)"
					        elif ch == ord('w'):
					            with open(filename, "w") as f:
					                for (i, w) in enumerate(sorted(words.values())):
					                    f.write(_dice_label(i) + " " + w + "\n")
					            unsaved = False
					        elif ch == ord('q'):
					            if unsaved:
					                _show_status(status_line, "Quit with unsaved data? [y/n]")
					                status_line.refresh()
					            if not unsaved or stdscr.getch() == ord('y'):
					                return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
					    # Exactly one argument is required: the word list file to edit.
					    if len(sys.argv) != 2:
					        print("usage: {} <filename>".format(sys.argv[0]), file=sys.stderr)
					        # sys.exit rather than the site-provided exit(), which is not
					        # guaranteed to exist (e.g. under ``python -S``).
					        sys.exit(1)

					    # curses.wrapper sets up the screen, calls main(stdscr) and restores the
					    # terminal afterwards, also when main raises.
					    curses.wrapper(main)
 | 
				
			||||||
							
								
								
									
										17
									
								
								pickle-vectors.py
									
										
									
									
									
										Executable file
									
								
							
							
						
						
									
										17
									
								
								pickle-vectors.py
									
										
									
									
									
										Executable file
									
								
							| 
						 | 
					@ -0,0 +1,17 @@
 | 
				
			||||||
 | 
					#!/usr/bin/env python3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import gzip
 | 
				
			||||||
 | 
					import numpy
 | 
				
			||||||
 | 
					import pickle
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Parse the gzipped Numberbatch text file into a dict of word -> numpy
					# array and store it as numberbatch.pkl for diceware-editor.py to load.
					vectors = {}
					# Decode explicitly as UTF-8 instead of relying on the locale's default
					# text encoding.
					with gzip.open("numberbatch-en-19.08.txt.gz", mode="rt", encoding="utf-8") as f:
					    # The first field of the header line is used as the total for the
					    # progress output below.
					    count = int(next(f).split()[0])
					    for (i, line) in enumerate(f):
					        if i % 1000 == 0:
					            print("{}/{}".format(i, count))
					        # Each remaining line is a word followed by its vector components.
					        [word, *vec] = line.split()
					        vectors[word] = numpy.array([float(x) for x in vec])

					with open('numberbatch.pkl', 'wb') as f:
					    pickle.dump(vectors, f)
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue