''' Allows scoring of text using n-gram probabilities 17/07/12 ''' from math import log10
class ngram_score(object):
    """Score text by summing the log10 probabilities of its n-grams.

    Attributes (all set by ``__init__``):
        ngrams: dict mapping each n-gram read from the file to
            ``log10(count / N)``.
        L: length of the last n-gram read from the file; ``score`` uses it
            as the window width, so the file is expected to contain
            n-grams of a single length.
        N: sum of all counts in the file.
        floor: ``log10(0.01 / N)``, the score given to any window that is
            not present in ``ngrams``.
    """

    def __init__(self, ngramfile, sep=' '):
        """Load n-gram counts from ``ngramfile`` and turn them into log probabilities.

        Args:
            ngramfile: path of a text file with one ``<ngram><sep><count>``
                record per line. Blank lines are ignored.
            sep: separator between the n-gram and its count.

        Raises:
            ValueError: if a non-blank line does not split into exactly two
                fields, if a count is not an integer, or if the file
                contains no records at all.
        """
        counts = {}
        last_key = None
        # ``with`` closes the handle even when a malformed line raises.
        with open(ngramfile) as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. an extra newline at EOF).
                    continue
                last_key, count = line.split(sep)
                # int() ignores the trailing newline left on ``count``.
                counts[last_key] = int(count)

        if last_key is None:
            raise ValueError('no n-grams found in %r' % (ngramfile,))

        self.L = len(last_key)
        self.N = sum(counts.values())
        # calculate log probabilities
        self.ngrams = {
            key: log10(float(count) / self.N) for key, count in counts.items()
        }
        self.floor = log10(0.01 / self.N)

    def score(self, text):
        """Return the summed log10 probability of every length-``L`` window of ``text``.

        Windows missing from the table contribute ``self.floor``. When
        ``text`` is shorter than ``L`` there are no windows and the result
        is ``0``.
        """
        lookup = self.ngrams.get
        width = self.L
        floor = self.floor
        total = 0
        # Explicit left-to-right accumulation keeps the float result
        # identical regardless of how the built-in sum() rounds.
        for start in range(len(text) - width + 1):
            total += lookup(text[start:start + width], floor)
        return total
# Bounded "top N" container: remembers only the N largest items added so far.
class nbest(object):
    """Keep the ``N`` largest items seen, ordered from largest to smallest."""

    def __init__(self, N=1000):
        """Create an empty container that retains at most ``N`` items."""
        self.N = N
        self.store = []

    def add(self, item):
        """Insert ``item``, re-sort in descending order and drop anything past ``N``."""
        entries = self.store
        entries.append(item)
        entries.sort(reverse=True)
        # Slicing rebinds ``store`` to a fresh list holding the top N entries.
        self.store = entries[:self.N]

    def __getitem__(self, k):
        """Return the ``k``-th best item (index 0 is the largest)."""
        return self.store[k]

    def __len__(self):
        """Return how many items are currently retained."""
        return len(self.store)
# init
# This section relies on names defined outside it: permutations, Autokey,
# ctext, trigram, qgram (and the nbest class).
ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
N = 100  # number of candidate key prefixes kept after every stage

for KLEN in range(3, 20):
    # Stage 1: try 3-letter key prefixes, padded with 'A' up to KLEN, and
    # keep the N best according to the trigram scorer. Only the first three
    # characters of every KLEN-sized block of the deciphered text are scored.
    # NOTE(review): permutations() never yields prefixes with a repeated
    # letter (e.g. 'AAB'); itertools.product(ALPHABET, repeat=3) would cover
    # them -- confirm whether skipping those keys is intended.
    rec = nbest(N)
    for i in permutations(ALPHABET, 3):
        prefix = ''.join(i)
        key = prefix + 'A' * (KLEN - len(i))
        pt = Autokey(key).decipher(ctext)
        score = 0
        for j in range(0, len(ctext), KLEN):
            score += trigram.score(pt[j:j + 3])
        rec.add((score, prefix, pt[:30]))

    # Stage 2: grow every surviving prefix by one letter at a time until it
    # is KLEN letters long, keeping the N best extensions after each step.
    for i in range(0, KLEN - 3):
        next_rec = nbest(N)
        # Iterate over what is actually stored instead of range(N), so a
        # container holding fewer than N entries cannot raise IndexError.
        for _, prefix, _ in rec:
            for c in ALPHABET:
                key = prefix + c
                fullkey = key + 'A' * (KLEN - len(key))
                pt = Autokey(fullkey).decipher(ctext)
                score = 0
                for j in range(0, len(ctext), KLEN):
                    score += qgram.score(pt[j:j + len(key)])
                next_rec.add((score, key, pt[:30]))
        rec = next_rec

    # Stage 3: re-score the survivors on the complete deciphered text and
    # report the best one. The strict '>' keeps the earliest candidate on a
    # tie. The winning plaintext is remembered so it is not deciphered again
    # for printing (decipher is assumed to be deterministic).
    bestkey = bestscore = bestpt = None
    for _, candidate, _ in rec:
        pt = Autokey(candidate).decipher(ctext)
        score = qgram.score(pt)
        if bestscore is None or score > bestscore:
            bestkey, bestscore, bestpt = candidate, score, pt
    print(bestscore, 'autokey, klen', KLEN, ':"' + bestkey + '",', bestpt)