Initial commit

commit 167d208685

4 changed files with 275 additions and 0 deletions
LICENSE (Normal file, 19 lines)
@@ -0,0 +1,19 @@
MIT License Copyright (c) 2020 xenofem <xenofematxenodotscience>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished
to do so, subject to the following conditions:

The above copyright notice and this permission notice (including the next
paragraph) shall be included in all copies or substantial portions of the
Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README (Normal file, 25 lines)
@@ -0,0 +1,25 @@
# diceware

This is the janky python code I used to help me generate the wordlist
I describe at [https://xeno.science/dice](https://xeno.science/dice).
This code is very bad and will probably crash if your terminal window
is too small.  Also I'm doing something wrong with ncurses, and after
it loads it'll show you a blank screen until you press a key.

## Dependencies

You'll need Python 3 and NumPy.

## Usage

First, [download the English-language ConceptNet Numberbatch
dataset](https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz),
and then run `pickle-vectors.py` to parse the Numberbatch vectors and
store them in a pickle for later access.  This takes a while but
you'll only need to do it once.

Then, run

```bash
diceware-editor.py [word list file]
```
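For context (this note and the snippet below are editorial, not files in the commit), the `[word list file]` argument uses the same format the editor's [w]rite command produces: one word per line, prefixed by a four-digit base-6 dice roll (digits 1 to 6), with a target of 6^4 = 1296 entries. A minimal sketch of that labelling, mirroring the formula in `diceware-editor.py`; the sample words are hypothetical:

```python
# Sketch only: reproduces the dice-index labelling that diceware-editor.py
# uses when it writes the word list (four dice, digits 1-6 each).
def dice_label(i):
    return "{}{}{}{}".format(i // 6**3 + 1, (i // 6**2) % 6 + 1, (i // 6) % 6 + 1, i % 6 + 1)

sample_words = ["abacus", "abdomen", "zygote"]  # hypothetical entries
for i, w in enumerate(sorted(sample_words)):
    print(dice_label(i), w)  # -> "1111 abacus", "1112 abdomen", "1113 zygote"
```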
							
								
								
									
diceware-editor.py (Executable file, 214 lines)
@@ -0,0 +1,214 @@
#!/usr/bin/env python3

import curses
import curses.ascii
import curses.textpad
import numpy
import numpy.linalg
import os.path
import pickle
import random
import sys
import textwrap

TARGET_LIST_LENGTH = 6**4
MAX_WORD_LENGTH = 25
SUGGESTION_DISTANCE_THRESHOLD = 1.2

def vectorize(word):
    # Numberbatch keys use underscores where a phrase has spaces or hyphens
    return word.replace(' ', '_').replace('-', '_')

vectors = {}
distance_cache = {}
def memoized_distance(w1, w2):
    # Euclidean distance between the two words' Numberbatch vectors,
    # cached per unordered pair; returns None if either word is unknown.
    w1 = vectorize(w1)
    w2 = vectorize(w2)
    if w1 not in vectors or w2 not in vectors:
        return None
    v1 = min(w1, w2)
    v2 = max(w1, w2)
    if v1 not in distance_cache:
        distance_cache[v1] = {}
    if v2 not in distance_cache[v1]:
        distance_cache[v1][v2] = numpy.linalg.norm(vectors[v1] - vectors[v2])

    return distance_cache[v1][v2]

def prefix(word):
    # A word's key in the list: its first three lowercase ASCII letters.
    return ''.join(c for c in word.lower() if curses.ascii.islower(c))[:3]

def main(stdscr):
    global vectors
    stdscr.clear()
    # Flush the cleared stdscr now, so the implicit refresh triggered by the
    # first getch() doesn't blank out the pads and windows drawn below.
    stdscr.refresh()
    stdscr.leaveok(False)

    filename = sys.argv[1]

    list_pad = curses.newpad(TARGET_LIST_LENGTH*2, MAX_WORD_LENGTH)
    status_line = curses.newwin(1, curses.COLS, curses.LINES - 1, 0)
    info_box = curses.newwin(curses.LINES - 1, MAX_WORD_LENGTH*2, 0, MAX_WORD_LENGTH)

    loading_message = "Loading conceptnet numberbatch vectors..."
    status_line.addstr(0, 0, loading_message)
    # Refresh so the message is visible during the slow pickle load below.
    status_line.refresh()
    with open('numberbatch.pkl', 'rb') as f:
        vectors = pickle.load(f)
    status_line.clear()
    status_line.addstr(0, 0, "Vectors loaded!")
    status_line.refresh()

    words = {}
    if os.path.isfile(filename):
        with open(filename) as f:
            for line in f:
                word = line.strip().split(maxsplit=1)[-1]
                if len(prefix(word)) < 3:
                    continue
                words[prefix(word)] = word

    pairs = {(w1, w2) for w1 in words.values() for w2 in words.values() if w2 > w1}
    loading_message = "Precomputing word vector distances: "
    status_line.clear()
    status_line.addstr(0, 0, loading_message)
    for (i, (w1, w2)) in enumerate(pairs):
        if i % 1000 == 0:
            status_line.addstr(0, len(loading_message), "{}/{} pairs".format(i, len(pairs)))
            status_line.refresh()
        memoized_distance(w1, w2)

    suggestion_candidates = list(vectors.keys())
    random.shuffle(suggestion_candidates)

    suggestion = ""
    pos = 0
    unsaved = False
    while True:
        status_line.clear()
        status_line.addstr(0, 0, "[a]dd/[d]elete/[j]down/[k]up/[/]search/[s]uggest/[w]write/[q]uit{}".format('*' if unsaved else ''))
        status_line.noutrefresh()

        list_pad.clear()
        for (i, w) in enumerate(sorted(words.values())):
            # Four base-6 digits (1-6 each): the dice roll for this word
            dice = "{}{}{}{}".format(i // 6**3 + 1, (i // 6**2) % 6 + 1, (i // 6) % 6 + 1, i % 6 + 1)
            entry = dice + " " + w
            list_pad.addstr(i, 0, entry)
        scroll_pos = max(0, min(len(words)-(curses.LINES-1), pos - curses.LINES//2))
        list_pad.noutrefresh(scroll_pos,0, 0,0, curses.LINES-2,MAX_WORD_LENGTH-1)

        unknown_cn_words = {w for w in words.values() if vectorize(w) not in vectors}

        # Keep the 8 smallest pairwise distances overall (the most similar,
        # and therefore most confusable, pairs in the list).
        worst_distances = []
        for w1 in words.values():
            for w2 in words.values():
                if w1 >= w2:
                    continue
                d = memoized_distance(w1, w2)
                if d is not None and (len(worst_distances) < 8 or d < worst_distances[-1][2]):
                    worst_distances.append((w1, w2, d))
                    worst_distances.sort(key=lambda x: x[2])
                    worst_distances = worst_distances[:8]

        current_word = None if len(words) == 0 else list(sorted(words.values()))[pos]
        # Keep the 8 words closest to the currently selected word.
        worst_current_distances = []
        for w in words.values():
            if w == current_word:
                continue
            d = memoized_distance(current_word, w)
            if d is not None and (len(worst_current_distances) < 8 or d < worst_current_distances[-1][1]):
                worst_current_distances.append((w, d))
                worst_current_distances.sort(key=lambda x: x[1])
                worst_current_distances = worst_current_distances[:8]

        info_box.clear()
        info_box.addstr(0, 0, """{count}/{target} words;
Worst overall distances:
{worst}
Worst distances from current word:
{worstc}
Unknown (to ConceptNet) words:
 {unk_c}
Suggestion:
 {sug}"""
                        .format(
                            count=len(words),
                            target=TARGET_LIST_LENGTH,
                            worst='\n'.join(' {} to {}, {:.2}'.format(*x) for x in worst_distances),
                            worstc='\n'.join(' {}, {:.2}'.format(*x) for x in worst_current_distances),
                            unk_c='\n '.join(unknown_cn_words if len(unknown_cn_words) <= 3 else list(unknown_cn_words)[:2] + ["..."]),
                            sug=suggestion
                        )
        )
        info_box.noutrefresh()

        curses.doupdate()

        stdscr.move(pos - scroll_pos, 0)

        ch = stdscr.getch()
        if ch == ord('-') or ch == ord('d'):
            if current_word:
                del words[prefix(current_word)]
                pos = min(max(0, len(words)-1), pos)
                unsaved = True
        elif ch == ord('+') or ch == ord('a'):
            status_line.clear()
            status_line.refresh()
            input_box = curses.textpad.Textbox(status_line)
            input_box.edit()
            word = input_box.gather().strip()
            if len(prefix(word)) >= 3:
                old = words.get(prefix(word), None)
                if old:
                    status_line.clear()
                    status_line.addstr(0,0, "Replace {}? [y/n]".format(old))
                    status_line.refresh()
                if not old or stdscr.getch() == ord('y'):
                    words[prefix(word)] = word
                    pos = sorted(words.values()).index(word)
                    unsaved = True
        elif ch == curses.KEY_DOWN or ch == ord('j'):
            pos = min(max(0, len(words)-1), pos+1)
        elif ch == curses.KEY_UP or ch == ord('k'):
            pos = max(0, pos-1)
        elif ch == ord('/'):
            status_line.clear()
            status_line.refresh()
            input_box = curses.textpad.Textbox(status_line)
            input_box.edit()
            word = input_box.gather()
            word = ''.join(c for c in word.lower() if curses.ascii.islower(c))
            if len(prefix(word)) >= 3 and prefix(word) in words:
                pos = sorted(words.values()).index(words[prefix(word)])
        elif ch == ord('s'):
            # Suggest a random Numberbatch word that is sufficiently far
            # from every word already in the list.
            while True:
                candidate = suggestion_candidates.pop()
                if len(prefix(candidate)) < 3 or prefix(candidate) in words:
                    continue
                min_dist = None
                for word in words.values():
                    d = memoized_distance(word, candidate)
                    if d is not None and (min_dist is None or d < min_dist):
                        min_dist = d
                if min_dist is None or min_dist > SUGGESTION_DISTANCE_THRESHOLD:
                    suggestion = candidate
                    break
        elif ch == ord('w'):
            with open(filename, "w") as f:
                for (i, w) in enumerate(sorted(words.values())):
                    dice = "{}{}{}{}".format(i // 6**3 + 1, (i // 6**2) % 6 + 1, (i // 6) % 6 + 1, i % 6 + 1)
                    entry = dice + " " + w
                    f.write(entry + "\n")
            unsaved = False
        elif ch == ord('q'):
            if unsaved:
                status_line.clear()
                status_line.addstr(0,0, "Quit with unsaved data? [y/n]")
                status_line.refresh()
            if not unsaved or stdscr.getch() == ord('y'):
                return


if len(sys.argv) != 2:
    print("usage: {} <filename>".format(sys.argv[0]), file=sys.stderr)
    exit(1)

curses.wrapper(main)
pickle-vectors.py (Executable file, 17 lines)
@@ -0,0 +1,17 @@
#!/usr/bin/env python3

import gzip
import numpy
import pickle

# Parse the gzipped Numberbatch text dump: the header line starts with the
# vector count, and each following line is a word and its vector components.
vectors = {}
with gzip.open("numberbatch-en-19.08.txt.gz", mode="rt") as f:
    count = int(next(f).split()[0])
    for (i, line) in enumerate(f):
        if i % 1000 == 0:
            print("{}/{}".format(i, count))
        [word, *vec] = line.split()
        vectors[word] = numpy.array([float(x) for x in vec])

with open('numberbatch.pkl', 'wb') as f:
    pickle.dump(vectors, f)
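As a quick sanity check (again editorial, not part of the commit), the pickle written by `pickle-vectors.py` can be loaded directly and used to compute the same Euclidean distance that `diceware-editor.py` relies on. The two words below are hypothetical placeholders and must exist in the Numberbatch vocabulary:

```python
# Sketch only: load the pickle produced by pickle-vectors.py and compute the
# Euclidean distance between two word vectors, as diceware-editor.py does.
import pickle

import numpy.linalg

with open("numberbatch.pkl", "rb") as f:
    vectors = pickle.load(f)

w1, w2 = "apple", "banana"  # hypothetical words, assumed present in the vocabulary
print(numpy.linalg.norm(vectors[w1] - vectors[w2]))
```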