Commit b4453cd
Changed files (3)
quadgram_freq.csv
@@ -1,4 +1,4 @@
-Quadrigrams,Frequency
+Quadgram,Frequency
that, 0.00761242
ther, 0.00604501
with, 0.00573866
simple_crypto.py
@@ -67,59 +67,65 @@ def dict_cosine_sim (a, b):
def _load_freq_table (path, column, lowercase=True):
    """Read one reference-frequency CSV into a dict.

    path      -- CSV file with a header row containing `column` and
                 'Frequency'.
    column    -- header name of the column holding the n-gram text.
    lowercase -- when true, keys are lower-cased before being stored.

    Returns {ngram: frequency}.  The frequency is stored exactly as the
    csv module yields it (an unconverted string), so whatever consumes the
    table sees the same values as before.
    """
    table = {}
    # 'rb' is the mode the Python 2 csv module expects; the with-block
    # guarantees the handle is closed once the table has been built.
    with open (path, 'rb') as handle:
        for row in csv.DictReader (handle, delimiter=','):
            key = row[column].lower() if lowercase else row[column]
            table[key] = row['Frequency']
    return table


def ngram_freq_cos_sim (s):
    """Return the most likely char used to 'encrypt' the string s.

    Every lowercase ASCII letter is tried as the key.  For each candidate
    the text produced by xor_char is filtered, lower-cased, and compared
    against reference English letter, bigram, trigram and quadgram
    frequencies; the four dict_cosine_sim scores are added together and the
    letter with the largest total is returned.

    Reference frequencies come from:
      http://www.cryptograms.org/letter-frequencies.php
      http://www3.nd.edu/~busiforc/handouts/cryptography/Letter%20Frequencies.html
      http://www.cse.chalmers.se/edu/year/2010/course/TDA351/ass1/en_stat.html

    NOTE(review): translator, xor_char and dict_cosine_sim are defined
    elsewhere in this file; dict_cosine_sim is assumed to cope with the
    string-valued reference tables -- confirm against its definition.
    """
    # (n, reference table) pairs, in the order their scores are summed.
    # Letter keys are stored as they appear in the CSV; the others are
    # lower-cased.
    reference_tables = (
        (1, _load_freq_table ('letter_freq.csv', 'Letter', lowercase=False)),
        (2, _load_freq_table ('bigram_freq.csv', 'Bigram')),
        (3, _load_freq_table ('trigram_freq.csv', 'Trigram')),
        (4, _load_freq_table ('quadgram_freq.csv', 'Quadgram')),
    )

    # keep only ASCII letters of the candidate plaintext
    text_filter = translator (keep=string.ascii_letters)

    sum_cos_sim = {}
    # generate 'decrypted' text using each lowercase letter as the key
    for char in string.ascii_lowercase:
        char_dec = text_filter (xor_char (s, char)).lower()
        score = 0
        for n, table in reference_tables:
            score += dict_cosine_sim (ngram_freq (n, char_dec, table), table)
        sum_cos_sim[char] = score

    return max (sum_cos_sim, key=sum_cos_sim.get)
def ngram_freq (n, s, freq_dict):
    """Return the relative frequency in s of each n-gram named in freq_dict.

    n         -- length of the n-grams to count.
    s         -- text to scan with a sliding window of width n.
    freq_dict -- mapping whose keys are the n-grams of interest; its
                 values are not read.

    Returns a collections.defaultdict holding, for every key of freq_dict,
    (occurrences of the key in s) / (number of windows in s).  When s is
    too short to contain a single window every frequency is 0.0 instead
    of dividing by zero or by a negative count.
    """
    # number of width-n windows that fit in s; <= 0 when s is too short
    windows = len(s) - (n - 1)

    count = collections.Counter()
    for i in range(0, windows):
        count[s[i:i + n]] += 1

    # container type kept as-is so callers see the same kind of object
    freq = collections.defaultdict (list)
    for key in freq_dict:
        if windows > 0:
            freq[key] = count[key] / float(windows)
        else:
            freq[key] = 0.0
    return freq
trigram_freq.csv
@@ -1,4 +1,4 @@
-Trigrams,Frequency
+Trigram,Frequency
the, 0.03508232
and, 0.01593878
ing, 0.01147042