Commit b4453cd

bryon <bryon.fryer@gmail.com>
2013-05-17 23:23:48
Working uni-, bi-, tri- and quadgram scoring; cleaned up code
1 parent 84cdc5f
quadgram_freq.csv
@@ -1,4 +1,4 @@
-Quadrigrams,Frequency
+Quadgram,Frequency
 that, 0.00761242
 ther, 0.00604501
 with, 0.00573866
simple_crypto.py
@@ -67,59 +67,65 @@ def dict_cosine_sim (a, b):
  
 def _load_ngram_table (path, key_column, lowercase=True):
     """Read one n-gram frequency CSV into a dict.
 
     path       -- CSV file whose header row contains `key_column` and
                   'Frequency'.
     key_column -- name of the column holding the n-gram text.
     lowercase  -- when true, keys are lowercased before being stored.
 
     Values are stored exactly as csv.DictReader yields them (strings);
     NOTE(review): presumably dict_cosine_sim converts them to numbers --
     confirm against its definition.
     """
     table = {}
     # 'rb' is the mode the Python 2 csv module expects.  The with-block
     # closes the handle, which the previous bare open() calls leaked.
     with open (path, 'rb') as handle:
         for row in csv.DictReader (handle, delimiter=','):
             key = row[key_column]
             if lowercase:
                 key = key.lower()
             table[key] = row['Frequency']
     return table
 
 
 def ngram_freq_cos_sim (s):
     """Return the lowercase letter most likely used to 'encrypt' s.
 
     Every letter of string.ascii_lowercase is tried as the key: s is
     run through xor_char with that letter, filtered with
     translator(keep=string.ascii_letters) and lowercased.  The
     candidate's 1-, 2-, 3- and 4-gram frequencies are each compared to
     the matching reference table with dict_cosine_sim, and the four
     similarities are summed.  The letter with the largest sum wins.
 
     The reference tables are re-read on every call from
     letter_freq.csv, bigram_freq.csv, trigram_freq.csv and
     quadgram_freq.csv, all opened by relative path.
     """
     # 'known' relative frequencies of n-grams in English, from
     # http://www.cryptograms.org/letter-frequencies.php
     # http://www3.nd.edu/~busiforc/handouts/cryptography/Letter%20Frequencies.html
     # http://www.cse.chalmers.se/edu/year/2010/course/TDA351/ass1/en_stat.html
     #
     # Single-letter keys are stored as they appear in the CSV (they were
     # never lowercased); the longer n-grams are lowercased.
     tables = (
         (1, _load_ngram_table ('letter_freq.csv', 'Letter', lowercase=False)),
         (2, _load_ngram_table ('bigram_freq.csv', 'Bigram')),
         (3, _load_ngram_table ('trigram_freq.csv', 'Trigram')),
         (4, _load_ngram_table ('quadgram_freq.csv', 'Quadgram')),
     )
 
     # keep only ASCII letters of each candidate plaintext
     text_filter = translator (keep=string.ascii_letters)
 
     # score the 'decrypted' text produced by each lowercase letter as key
     scores = {}
     for char in string.ascii_lowercase:
         candidate = text_filter (xor_char (s, char)).lower()
         # Sum the uni-, bi-, tri- and quadgram similarities, in that order.
         scores[char] = sum (
             dict_cosine_sim (ngram_freq (n, candidate, table), table)
             for n, table in tables
         )
 
     return max (scores, key=scores.get)
 
 def ngram_freq (n, s, freq_dict):
     """Return the relative frequency in s of every n-gram in freq_dict.
 
     n         -- n-gram length (1 = letters, 2 = bigrams, ...).
     s         -- text to scan; every window s[i:i+n] is counted.
     freq_dict -- mapping whose keys are the n-grams to report; its
                  values are not read.
 
     Returns a defaultdict(float) with one entry per key of freq_dict:
     occurrences of the key divided by the number of windows in s.
     When s is shorter than n there are no windows, so every frequency
     is 0.0 (previously this raised ZeroDivisionError or divided by a
     negative count).
     """
     # Number of length-n windows in s; clamp so short input yields none.
     windows = max (len(s) - (n - 1), 0)
     count = collections.Counter (s[i:i + n] for i in range(windows))
 
     freq = collections.defaultdict (float)
     for key in freq_dict:
         # Explicit zero-window guard avoids dividing by zero.
         freq[key] = count.get(key, 0.0) / float(windows) if windows else 0.0
     return freq
trigram_freq.csv
@@ -1,4 +1,4 @@
-Trigrams,Frequency
+Trigram,Frequency
 the, 0.03508232
 and, 0.01593878
 ing, 0.01147042