In [61]:
def get_file_lines(fn):
    """Return a list of the file's non-empty lines, stripped of surrounding
    whitespace -- booklist.txt and stopwords.txt hold one entry per line."""
    with open(fn, 'r') as fileObj:
        return filter(
            lambda x: x,
            map(lambda x: x.strip(), fileObj.readlines())
        )
In [62]:
book_filenames = get_file_lines('booklist.txt')
stopwords = get_file_lines('stopwords.txt')
In [63]:
import collections
import re
from math import log
In [64]:
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i+n]
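As a quick sanity check, chunks() slices a list into consecutive n-sized pieces, with a shorter final chunk when the length doesn't divide evenly. For example, on a toy list:

list(chunks([1, 2, 3, 4, 5], 2))   # -> [[1, 2], [3, 4], [5]]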
In [65]:
def tokenize(text):
    """Split text into word tokens, dropping anything in the stopword list."""
    return filter(
        lambda x: x not in stopwords,
        re.findall(r"\w+", text)
    )

def inv_doc_freq(word, token_chunks):
    """Inverse document frequency: the log of (total chunks) divided by
    (chunks containing the word)."""
    count = 0
    for chunk in token_chunks:
        if word in chunk:
            count += 1
    df = count / float(len(token_chunks))
    return log(1 / df)

def tf_idf(tokens, token_chunks):
    """Score each distinct token in a chunk by term frequency times inverse
    document frequency, returning (word, score) pairs sorted high to low."""
    wordcount = float(len(tokens))
    mc = collections.Counter(tokens).most_common()
    tfidfs = map(
        lambda x: (
            x[0],
            (x[1] / wordcount) * inv_doc_freq(x[0], token_chunks)
        ),
        mc
    )
    return sorted(tfidfs, key=lambda x: -x[1])

def get_freq_chunks(booktext, chunksize):
    """Split a book into chunks of `chunksize` paragraphs and return the
    TF-IDF rankings for each chunk."""
    grafs = filter(lambda x: x, booktext.split('\n'))
    grafChunks = list(chunks(grafs, chunksize))
    grafChunkStrs = map(lambda x: '\n'.join(x).lower(), grafChunks)
    tokenChunks = map(tokenize, grafChunkStrs)
    return map(lambda x: tf_idf(x, tokenChunks), tokenChunks)
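As a rough sketch of how these functions interact, consider two toy token chunks: a word appearing in only one of the two chunks gets an inverse document frequency of log(2) ≈ 0.69, while a word present in both gets log(1) = 0, so words shared by every chunk fall to the bottom of each chunk's ranking.

toy_chunks = [['ice', 'nine', 'ice'], ['bokonon', 'ice']]
inv_doc_freq('nine', toy_chunks)    # log(1 / 0.5)  ->  ~0.693
inv_doc_freq('ice', toy_chunks)     # log(1 / 1.0)  ->  0.0
tf_idf(toy_chunks[0], toy_chunks)   # [('nine', ~0.231), ('ice', 0.0)]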
In [66]:
wordFreqDict = dict()

for fn in book_filenames:
    # Read the full text of each book from the books/ directory.
    with open("books/" + fn, 'r') as bookObj:
        bookText = bookObj.read()

    # Key each book by its title (the filename minus its extension) and
    # score it in chunks of ten paragraphs.
    bookTitle = fn.rsplit('.', 1)[0]
    wordFreqDict[bookTitle] = get_freq_chunks(bookText, 10)
In [67]:
wordFreqDict.keys()
Out[67]:
['Galapagos',
 'Bluebeard',
 'Breakfast of Champions',
 'Basic Training',
 'Mother Night',
 'Armageddon In Retrospect',
 'Slaughterhouse-Five',
 'Jailbird',
 'Slapstick',
 'Happy Birthday, Wanda June',
 'God Bless You, Dr. Kevorkian',
 'Palm Sunday',
 'We Are What We Pretend to Be',
 "If This Isn't Nice, What Is",
 'Man without a Country, A',
 "Cat's Cradle",
 'While Mortals Sleep',
 'Sirens of Titan, The',
 'Welcome to the Monkey House',
 '2BR02B',
 'Petrified Ants',
 'God Bless You, Mr. Rosewater',
 'Deadeye Dick',
 'Hocus Pocus',
 'Letters',
 'Timequake',
 'Player Piano',
 'Look at the Birdie',
 'Bagombo Snuff Box']
In [68]:
len(wordFreqDict['Sirens of Titan, The'])
Out[68]:
276
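Each of those 276 entries is the ranked (word, score) list for one ten-paragraph chunk, so a chunk's most distinctive terms sit at the front. For instance, the top of the first chunk can be inspected with:

wordFreqDict['Sirens of Titan, The'][0][:5]   # five highest-scoring (word, tf-idf) pairs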
In [69]:
import json

with open('tfidf.json', 'w') as outfile:
    json.dump(wordFreqDict, outfile)
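JSON has no tuple type, so when the file is read back each (word, score) pair comes out as a two-element list. A quick round-trip check (illustrative):

with open('tfidf.json') as infile:
    roundtrip = json.load(infile)
# each chunk is now a list of [term, score] lists rather than tuples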