master
1import base64
2import string
3import collections
4import csv
5import random
6import operator
7from itertools import izip_longest
8from math import sqrt
9
10def translator (frm='', to='', delete='', keep=None):
11 # Function for stripping out characters we don't care about
12 # Python Cookbook Recipe 1.9
13 # Chris Perkins, Raymond Hettinger
14 if len (to) == 1: to = to * len (frm)
15 trans = string.maketrans (frm, to)
16 if keep is not None:
17 allchars = string.maketrans ('', '')
18 # delete is expanded to delete everything except
19 # what is mentioned in set(keep)-set(delete)
20 delete = allchars.translate (allchars, keep.translate (allchars, delete))
21 def translate (s):
22 return s.translate (trans, delete)
23 return translate
24
25def xor_string (a_string,b_string):
26 # Given two equal length strings XOR each character and return the result
27 if (len(a_string) != len(b_string)):
28 print "Unequal length strings!"
29 return None
30 else:
31 return ''.join(chr(ord(a) ^ ord(b)) for a,b in zip(a_string,b_string))
32
33def xor_char (s, c):
34 # xor each character of the string s with the char c
35 # mostly obsolete, use xor_str for everything and it works fine
36 xor_string = ''
37 for a in s:
38 xor_string += chr (ord (a) ^ ord (c))
39 return xor_string
40
41def xor_str (s, k):
42 # xor each character of the string s with each char in k (repeating of course)
43 out = ''
44 index = 0
45 for char in s:
46 out = out + (chr (ord(char) ^ ord(k[index])))
47 index = (index + 1) % len(k)
48 return out
49
50def dict_euclidian_dist (a, b):
51 e_sum = 0
52 for x in a:
53 e_sum += (float (a[x]) - float(b[x]))**2
54 return sqrt (e_sum)
55
56def dict_cosine_sim (a, b):
57 a_sq_sum = 0
58 b_sq_sum = 0
59 ab_sum = 0
60 for x in a:
61 a_sq_sum += float(a[x])**2
62 b_sq_sum += float(b[x])**2
63 ab_sum += float(a[x]) * float(b[x])
64 if a_sq_sum == 0.0 or b_sq_sum == 0.0:
65 return 0
66 return (ab_sum / (sqrt (a_sq_sum) * sqrt (b_sq_sum)))
67
68def freq_array_load():
69 # prepare the 'known' relative frequencies of letters in English
70 # from http://www.cryptograms.org/letter-frequencies.php
71 # http://www3.nd.edu/~busiforc/handouts/cryptography/Letter%20Frequencies.html
72 # http://www.cse.chalmers.se/edu/year/2010/course/TDA351/ass1/en_stat.html
73 eng_freq = {}
74 bigram_freq = {}
75 trigram_freq = {}
76 quadgram_freq = {}
77 freq_csv = csv.DictReader (open ('letter_freq.csv', 'rb'), delimiter=',')
78 bigram_csv = csv.DictReader (open ('bigram_freq.csv', 'rb'), delimiter=',')
79 trigram_csv = csv.DictReader (open ('trigram_freq.csv', 'rb'), delimiter=',')
80 quadgram_csv = csv.DictReader (open ('quadgram_freq.csv', 'rb'), delimiter=',')
81 sum_cos_sim = {}
82
83 for line in freq_csv:
84 eng_freq[line['Letter']] = line['Frequency']
85 for line in bigram_csv:
86 bigram_freq[line['Bigram'].lower()] = line['Frequency']
87 for line in trigram_csv:
88 trigram_freq[line['Trigram'].lower()] = line['Frequency']
89 for line in quadgram_csv:
90 quadgram_freq[line['Quadgram'].lower()] = line['Frequency']
91 return [(1,eng_freq), (2,bigram_freq), (3,trigram_freq), (4,quadgram_freq)]
92
93
94def ngram_freq_cos_sim (s, freq_array):
95 # returns (key,value) where key is likley to decrypt string s
96 # value is the sum'd cosine similarity values w.r.t. the given freq_array
97
98 sum_cos_sim = {}
99 # remove non-ascii letters
100 text_filter = translator (keep=string.ascii_letters)
101 for char in string.printable:
102 sum_cos_sim[char] = 0
103 t = text_filter (xor_str (s, char)).lower()
104 for n,f in freq_array:
105 sum_cos_sim[char] += dict_cosine_sim (ngram_freq(n, t, f), f)
106
107 key = max (sum_cos_sim, key=sum_cos_sim.get)
108 value = sum_cos_sim[key]
109 return (key, value)
110
111
112def old_ngram_freq_cos_sim (s):
113 eng_freq = {}
114 bigram_freq = {}
115 trigram_freq = {}
116 quadgram_freq = {}
117 freq_csv = csv.DictReader (open ('letter_freq.csv', 'rb'), delimiter=',')
118 bigram_csv = csv.DictReader (open ('bigram_freq.csv', 'rb'), delimiter=',')
119 trigram_csv = csv.DictReader (open ('trigram_freq.csv', 'rb'), delimiter=',')
120 quadgram_csv = csv.DictReader (open ('quadgram_freq.csv', 'rb'), delimiter=',')
121 sum_cos_sim = {}
122
123 for line in freq_csv:
124 eng_freq[line['Letter']] = line['Frequency']
125 for line in bigram_csv:
126 bigram_freq[line['Bigram'].lower()] = line['Frequency']
127 for line in trigram_csv:
128 trigram_freq[line['Trigram'].lower()] = line['Frequency']
129 for line in quadgram_csv:
130 quadgram_freq[line['Quadgram'].lower()] = line['Frequency']
131
132 # remove non-ascii letters
133 text_filter = translator (keep=string.ascii_letters)
134
135 # generate 'decrypted' text using each lowercase letters as the key
136 for char in string.printable:
137 sum_cos_sim[char] = 0
138 char_dec = text_filter (xor_char (s, char)).lower()
139
140 # single character test
141 char_dec_freq = ngram_freq(1, char_dec,eng_freq)
142 sum_cos_sim[char] += dict_cosine_sim (char_dec_freq, eng_freq)
143
144 # bigram test
145 bigram_dec_freq = ngram_freq(2, char_dec, bigram_freq)
146 sum_cos_sim[char] += dict_cosine_sim (bigram_dec_freq, bigram_freq)
147
148 # trigram test
149 trigram_dec_freq = ngram_freq(3, char_dec, trigram_freq)
150 sum_cos_sim[char] += dict_cosine_sim (trigram_dec_freq, trigram_freq)
151
152 # quadgram test
153 quadgram_dec_freq = ngram_freq(4, char_dec, quadgram_freq)
154 sum_cos_sim[char] += dict_cosine_sim (quadgram_dec_freq, quadgram_freq)
155
156 '''#
157 for t in sorted(sum_cos_sim, key=sum_cos_sim.get):
158 print t, sum_cos_sim[t]
159 '''
160 key = max (sum_cos_sim, key=sum_cos_sim.get)
161 value = sum_cos_sim[key]
162 return (key, value)
163
164def ngram_freq (n, s, freq_dict):
165 count = collections.Counter()
166 freq = collections.defaultdict (list)
167 for i in range(0,len(s)-(n-1)):
168 count[s[i:len(s)-(len(s)-(n+i))]] += 1
169 for key in freq_dict:
170 if (len(s)-(n-1)==0):
171 freq[key] = 0.0
172 else:
173 freq[key] = count.get(key,0.0)/float(len(s)-(n-1))
174 return freq
175
176def test_xor (s, k):
177 encrypt = xor_str (s , k)
178 decrypt = xor_str (encrypt, k)
179 encrypt_again = xor_str (decrypt, k)
180 assert encrypt == encrypt_again
181 assert decrypt == s
182 return True
183
184def str_tips (s, n):
185 if n > len(s)/2:
186 return s
187 else:
188 return s[:n] + " ... " + s[-n:]
189
190def hamming_dist (s1, s2):
191 # http://en.wikipedia.org/wiki/Hamming_distance
192 # this version specifically examines the binary of each character, not
193 # just the characters like the wikipedia one does.
194 assert len(s1) == len(s2), "hamming_distance requires equal length strings"
195 dist = 0
196 for ch1, ch2 in zip(s1, s2):
197 dist += sum ( a!=b
198 for a, b in zip(
199 bin(ord(ch1))[2:].zfill(8),
200 bin(ord(ch2))[2:].zfill(8))
201 )
202 return dist
203
204def key_len_finder (s):
205 # nhd = normalized hamming distance dict
206 nhdd = {}
207 min_nhd = (0,41.0)
208 blocks = 4
209 for keysize in xrange (1,41):
210 sum_nhd = 0
211 for i in range (blocks):
212 # normalized by keysize*8 since hamming dist is binary (char = 8 bits)
213 sum_nhd += hamming_dist (
214 s[i*keysize:i*keysize+keysize],
215 s[i*keysize+keysize:i*keysize+keysize*2]
216 ) / float (keysize*8)
217 nhd = sum_nhd / float(blocks)
218 nhdd[keysize] = nhd
219 if nhd < min_nhd[1]:
220 min_nhd = (keysize, nhd)
221 return sorted(nhdd.iteritems(), key=operator.itemgetter(1))
222
223def grouper (iterable, n, fillvalue=None):
224 "Collect data into fixed-length chunks or blocks"
225 # http://docs.python.org/2/library/itertools.html#recipes
226 # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
227 args = [iter(iterable)] * n
228 return izip_longest(fillvalue=fillvalue, *args)
229
230def group_and_trans (s,key_len):
231 transpose = ['' for i in range (key_len)]
232 s_segments = grouper(s,key_len,'')
233 for j in s_segments:
234 for i in range(key_len):
235 transpose[i] += j[i]
236 return transpose