master
Raw Download raw file
  1import base64
  2import string
  3import collections
  4import csv
  5import random
  6import operator
  7from itertools import izip_longest
  8from math import sqrt
  9
 10def translator (frm='', to='', delete='', keep=None):
 11    # Function for stripping out characters we don't care about
 12    # Python Cookbook Recipe 1.9
 13    # Chris Perkins, Raymond Hettinger
 14    if len (to) == 1: to = to * len (frm)
 15    trans = string.maketrans (frm, to)
 16    if keep is not None:
 17        allchars = string.maketrans ('', '')
 18        # delete is expanded to delete everything except
 19        # what is mentioned in set(keep)-set(delete)
 20        delete = allchars.translate (allchars, keep.translate (allchars, delete))
 21    def translate (s):
 22        return s.translate (trans, delete)
 23    return translate
 24
 25def xor_string (a_string,b_string):
 26    # Given two equal length strings XOR each character and return the result
 27    if (len(a_string) != len(b_string)):
 28        print "Unequal length strings!"
 29        return None
 30    else:
 31        return ''.join(chr(ord(a) ^ ord(b)) for a,b in zip(a_string,b_string))
 32
 33def xor_char (s, c):
 34    # xor each character of the string s with the char c
 35    # mostly obsolete, use xor_str for everything and it works fine
 36    xor_string = ''
 37    for a in s:
 38        xor_string += chr (ord (a) ^ ord (c))
 39    return xor_string
 40
 41def xor_str (s, k):
 42    # xor each character of the string s with each char in k (repeating of course)
 43    out = ''
 44    index = 0
 45    for char in s:
 46        out = out + (chr (ord(char) ^ ord(k[index])))
 47        index = (index + 1) % len(k)
 48    return out
 49
 50def dict_euclidian_dist (a, b):
 51    e_sum = 0
 52    for x in a:
 53        e_sum += (float (a[x]) - float(b[x]))**2
 54    return sqrt (e_sum)
 55
 56def dict_cosine_sim (a, b):
 57    a_sq_sum = 0
 58    b_sq_sum = 0
 59    ab_sum = 0
 60    for x in a:
 61        a_sq_sum += float(a[x])**2
 62        b_sq_sum += float(b[x])**2
 63        ab_sum += float(a[x]) * float(b[x])
 64    if a_sq_sum == 0.0 or b_sq_sum == 0.0:
 65        return 0
 66    return (ab_sum / (sqrt (a_sq_sum) * sqrt (b_sq_sum)))
 67
 68def freq_array_load():
 69    # prepare the 'known' relative frequencies of letters in English
 70    # from http://www.cryptograms.org/letter-frequencies.php
 71    # http://www3.nd.edu/~busiforc/handouts/cryptography/Letter%20Frequencies.html  
 72    # http://www.cse.chalmers.se/edu/year/2010/course/TDA351/ass1/en_stat.html
 73    eng_freq = {}
 74    bigram_freq = {}
 75    trigram_freq = {}
 76    quadgram_freq = {}
 77    freq_csv = csv.DictReader (open ('letter_freq.csv', 'rb'), delimiter=',')    
 78    bigram_csv = csv.DictReader (open ('bigram_freq.csv', 'rb'), delimiter=',')    
 79    trigram_csv = csv.DictReader (open ('trigram_freq.csv', 'rb'), delimiter=',')    
 80    quadgram_csv = csv.DictReader (open ('quadgram_freq.csv', 'rb'), delimiter=',')    
 81    sum_cos_sim = {}
 82    
 83    for line in freq_csv:
 84        eng_freq[line['Letter']] = line['Frequency']
 85    for line in bigram_csv:
 86        bigram_freq[line['Bigram'].lower()] = line['Frequency']
 87    for line in trigram_csv:
 88        trigram_freq[line['Trigram'].lower()] = line['Frequency']
 89    for line in quadgram_csv:
 90        quadgram_freq[line['Quadgram'].lower()] = line['Frequency']
 91    return [(1,eng_freq), (2,bigram_freq), (3,trigram_freq), (4,quadgram_freq)] 
 92
 93
 94def ngram_freq_cos_sim (s, freq_array):
 95    # returns (key,value) where key is likley to decrypt string s
 96    # value is the sum'd cosine similarity values w.r.t. the given freq_array
 97
 98    sum_cos_sim = {}
 99    # remove non-ascii letters
100    text_filter = translator (keep=string.ascii_letters)
101    for char in string.printable:
102        sum_cos_sim[char] = 0
103        t = text_filter (xor_str (s, char)).lower()
104        for n,f in freq_array:
105            sum_cos_sim[char] += dict_cosine_sim (ngram_freq(n, t, f), f)
106           
107    key = max (sum_cos_sim, key=sum_cos_sim.get) 
108    value = sum_cos_sim[key]
109    return (key, value)
110           
111            
112def old_ngram_freq_cos_sim (s):
113    eng_freq = {}
114    bigram_freq = {}
115    trigram_freq = {}
116    quadgram_freq = {}
117    freq_csv = csv.DictReader (open ('letter_freq.csv', 'rb'), delimiter=',')    
118    bigram_csv = csv.DictReader (open ('bigram_freq.csv', 'rb'), delimiter=',')    
119    trigram_csv = csv.DictReader (open ('trigram_freq.csv', 'rb'), delimiter=',')    
120    quadgram_csv = csv.DictReader (open ('quadgram_freq.csv', 'rb'), delimiter=',')    
121    sum_cos_sim = {}
122    
123    for line in freq_csv:
124        eng_freq[line['Letter']] = line['Frequency']
125    for line in bigram_csv:
126        bigram_freq[line['Bigram'].lower()] = line['Frequency']
127    for line in trigram_csv:
128        trigram_freq[line['Trigram'].lower()] = line['Frequency']
129    for line in quadgram_csv:
130        quadgram_freq[line['Quadgram'].lower()] = line['Frequency']
131    
132    # remove non-ascii letters
133    text_filter = translator (keep=string.ascii_letters)
134    
135    # generate 'decrypted' text using each lowercase letters as the key
136    for char in string.printable:
137        sum_cos_sim[char] = 0
138        char_dec = text_filter (xor_char (s, char)).lower()
139        
140        # single character test
141        char_dec_freq = ngram_freq(1, char_dec,eng_freq)
142        sum_cos_sim[char] += dict_cosine_sim (char_dec_freq, eng_freq)    
143        
144        # bigram test
145        bigram_dec_freq = ngram_freq(2, char_dec, bigram_freq)
146        sum_cos_sim[char] += dict_cosine_sim (bigram_dec_freq, bigram_freq)
147    
148        # trigram test 
149        trigram_dec_freq = ngram_freq(3, char_dec, trigram_freq)
150        sum_cos_sim[char] += dict_cosine_sim (trigram_dec_freq, trigram_freq)
151 
152        # quadgram test
153        quadgram_dec_freq = ngram_freq(4, char_dec, quadgram_freq)
154        sum_cos_sim[char] += dict_cosine_sim (quadgram_dec_freq, quadgram_freq)
155
156    '''#
157    for t in sorted(sum_cos_sim, key=sum_cos_sim.get):
158        print t, sum_cos_sim[t]
159    '''
160    key = max (sum_cos_sim, key=sum_cos_sim.get) 
161    value = sum_cos_sim[key]
162    return (key, value)
163
164def ngram_freq (n, s, freq_dict):
165    count = collections.Counter()
166    freq = collections.defaultdict (list)
167    for i in range(0,len(s)-(n-1)):
168        count[s[i:len(s)-(len(s)-(n+i))]] += 1
169    for key in freq_dict:
170        if (len(s)-(n-1)==0):
171            freq[key] = 0.0
172        else:
173            freq[key] = count.get(key,0.0)/float(len(s)-(n-1))
174    return freq
175
176def test_xor (s, k):
177    encrypt = xor_str (s , k)
178    decrypt = xor_str (encrypt, k)
179    encrypt_again = xor_str (decrypt, k)
180    assert encrypt == encrypt_again
181    assert decrypt == s
182    return True
183
184def str_tips (s, n):
185    if n > len(s)/2:
186        return s
187    else:
188        return s[:n] + " ... " + s[-n:]
189
190def hamming_dist (s1, s2):
191    # http://en.wikipedia.org/wiki/Hamming_distance   
192    # this version specifically examines the binary of each character, not 
193    # just the characters like the wikipedia one does.
194    assert len(s1) == len(s2), "hamming_distance requires equal length strings"
195    dist = 0
196    for ch1, ch2 in zip(s1, s2):
197        dist += sum ( a!=b 
198            for a, b in zip(
199                bin(ord(ch1))[2:].zfill(8), 
200                bin(ord(ch2))[2:].zfill(8))
201        )
202    return dist
203 
204def key_len_finder (s):
205    # nhd = normalized hamming distance dict
206    nhdd = {}
207    min_nhd = (0,41.0)
208    blocks = 4
209    for keysize in xrange (1,41):
210        sum_nhd = 0
211        for i in range (blocks):
212            # normalized by keysize*8 since hamming dist is binary (char = 8 bits)
213            sum_nhd += hamming_dist (
214                    s[i*keysize:i*keysize+keysize], 
215                    s[i*keysize+keysize:i*keysize+keysize*2] 
216                ) / float (keysize*8)
217        nhd = sum_nhd / float(blocks)
218        nhdd[keysize] = nhd
219        if nhd < min_nhd[1]:
220            min_nhd = (keysize, nhd)
221    return sorted(nhdd.iteritems(), key=operator.itemgetter(1))
222
223def grouper (iterable, n, fillvalue=None):
224    "Collect data into fixed-length chunks or blocks"
225    # http://docs.python.org/2/library/itertools.html#recipes
226    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
227    args = [iter(iterable)] * n
228    return izip_longest(fillvalue=fillvalue, *args)
229
230def group_and_trans (s,key_len):
231    transpose = ['' for i in range (key_len)]
232    s_segments = grouper(s,key_len,'')
233    for j in s_segments:
234        for i in range(key_len):
235            transpose[i] += j[i]
236    return transpose