def get_file_lines(fn):
    """Return the non-blank lines of a text file, stripped of whitespace."""
    fileObj = open(fn, 'r')
    # Strip first, then filter, so blank lines (which readlines() returns
    # as '\n') are actually dropped instead of surviving as empty strings.
    fileList = filter(
        lambda x: x,
        map(
            lambda x: x.strip(),
            fileObj.readlines()
        )
    )
    fileObj.close()
    return fileList
book_filenames = get_file_lines('booklist.txt')
stopwords = get_file_lines('stopwords.txt')
import collections
import re
from math import log
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i+n]
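# Sanity check (a toy example of my own, not part of the original pipeline):
# seven items chunked in threes should come back as groups of 3, 3, and 1.
assert list(chunks(['a', 'b', 'c', 'd', 'e', 'f', 'g'], 3)) == \
    [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]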
def tokenize(text):
    # Split on word characters and drop anything in the stopword list.
    return filter(
        lambda x: x not in stopwords,
        re.findall(r"\w+", text)
    )
def inv_doc_freq(word, token_chunks):
    """Inverse document frequency: log(1/df), where df is the fraction of
    chunks that contain the word at least once."""
    count = 0
    for chunk in token_chunks:
        if word in chunk:
            count += 1
    df = count / float(len(token_chunks))
    return log(1/df)
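# Worked example with made-up numbers: a word appearing in 2 of 4 chunks has
# df = 0.5, so idf = log(1/0.5) = log(2), about 0.693. A word that appears in
# every chunk gets log(1) = 0 and contributes nothing to its tf-idf score.
assert abs(inv_doc_freq('whale', [['whale'], ['boat'], ['whale'], ['sea']]) - log(2)) < 1e-9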
def tf_idf(tokens, token_chunks):
    """Score every distinct token in `tokens` by term frequency times
    inverse document frequency, highest-scoring first."""
    wordcount = float(len(tokens))
    mc = collections.Counter(tokens).most_common()
    tfidfs = map(
        lambda x: (
            x[0],
            (x[1]/wordcount) * inv_doc_freq(x[0], token_chunks)
        ),
        mc
    )
    return sorted(tfidfs, key=lambda x: -x[1])
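# Quick illustration on toy chunks of my own (not the book corpus): 'whale'
# is frequent in the first chunk but rare across chunks, so it ranks first;
# 'boat' appears in every chunk, so its idf (and hence its score) is zero.
toy_chunks = [['whale', 'whale', 'boat'], ['boat', 'sea'], ['sea', 'boat']]
print(tf_idf(toy_chunks[0], toy_chunks))
# => [('whale', 0.732...), ('boat', 0.0)]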
def get_freq_chunks(booktext, chunksize):
    """Split a book into chunks of `chunksize` paragraphs and compute tf-idf
    scores for each chunk against the rest of the book's chunks."""
    grafs = filter(lambda x: x, booktext.split('\n'))
    grafChunks = list(chunks(grafs, chunksize))
    grafChunkStrs = map(lambda x: '\n'.join(x).lower(), grafChunks)
    tokenChunks = map(tokenize, grafChunkStrs)
    return map(lambda x: tf_idf(x, tokenChunks), tokenChunks)
wordFreqDict = dict()
for fn in book_filenames:
    # Read each book, key it by its filename minus the extension, and store
    # the per-chunk tf-idf scores (10 paragraphs per chunk).
    bookObj = open("books/" + fn, 'r')
    bookText = bookObj.read()
    bookObj.close()
    bookTitle = fn.rsplit('.', 1)[0]
    wordFreqDict[bookTitle] = get_freq_chunks(bookText, 10)
# Notebook-style spot checks: which book titles were indexed, and how many
# chunks one of them produced.
wordFreqDict.keys()
len(wordFreqDict['Sirens of Titan, The'])
import json
with open('tfidf.json', 'w') as outfile:
    json.dump(wordFreqDict, outfile)
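# A minimal sketch of reading the dump back in. The 'Sirens of Titan, The'
# key only exists if booklist.txt listed that file, as above; json.dump turns
# each (word, score) tuple into a [word, score] list.
with open('tfidf.json') as infile:
    freqs = json.load(infile)
first_chunk = freqs['Sirens of Titan, The'][0]
print(first_chunk[:10])  # the ten highest-scoring words in the book's first chunk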