#!/usr/bin/python
"""Search a word list (stdin) for t9 word-equivalences.

Some t9 word-equivalences, like home/good, have amused me when I
noticed them; this is simply some code to search for them.
"""
import sys

# we don't care about punctuation or accents for now, just letters
phone_letters = {
    2: "abc",
    3: "def",
    4: "ghi",
    5: "jkl",
    6: "mno",
    7: "pqrs",
    8: "tuv",
    9: "wxyz",
}

letter_to_t9 = {}


def makemap():
    """Fill letter_to_t9 and return a str.translate table (letter -> digit).

    Side effect: populates the module-level letter_to_t9 dict.
    """
    for tnum, letters in phone_letters.items():
        for letter in letters:
            letter_to_t9[letter] = str(tnum)
    # str.maketrans accepts a {char: replacement} mapping directly,
    # replacing the Python-2-only string.maketrans(from, to) dance.
    return str.maketrans(letter_to_t9)


t9map = makemap()


def t9(word):
    """render word as t9 string"""
    # Characters without a keypad digit (apostrophes, accents, ...)
    # pass through unchanged.
    return word.translate(t9map)


# TODO: build table.
# start with a simple hash
# then see if a master set with a separate duplicates table helps


def tick(cnt=[0]):
    """Print a progress dot to stderr, wrapping the line every 80 dots.

    NOTE: the mutable default argument is a deliberate cross-call
    counter, not a bug.
    """
    sys.stderr.write(".")
    cnt[0] += 1
    if cnt[0] % 80 == 0:
        sys.stderr.write("\n")
    sys.stderr.flush()


# t9 digit-string -> set of words that render to it
words = {}

if __name__ == "__main__":
    # Iterate stdin directly instead of readlines(): streams the word
    # list instead of materializing it.
    for word in sys.stdin:
        word = word.strip().lower()
        word9 = t9(word)
        words.setdefault(word9, set()).add(word)
        if len(words[word9]) > 2:
            tick()
    # NOTE(review): key=int assumes every word was pure letters, so every
    # key is all digits; a word with an apostrophe (e.g. "can't") would
    # make int() raise here — confirm against the input word list.
    for word9 in sorted(words, key=int):
        if len(words[word9]) > 1:
            print(word9, " ".join(sorted(words[word9])))

# time cat /usr/share/dict/words | t9.py > t9.out
# t9.out is 10k lines, 265k bytes, and 30s on the EEE
# time awk '{ print NF, $0 }' < t9.out | sort -n | tail -10
# 13 7663 pome pond pone poof rome rond rone rood roof snod some sond
# but mostly fails to impress me with the quality of the wordlist...