def get_file_lines(fn):
    """Return the non-blank lines of a text file, stripped of whitespace."""
    fileObj = open(fn, 'r')
    # Strip first, then filter, so blank lines (which readlines() returns
    # as '\n') are actually dropped instead of surviving as empty strings.
    fileList = filter(
        lambda x: x,
        map(
            lambda x: x.strip(),
            fileObj.readlines()
        )
    )
    fileObj.close()
    return fileList
book_filenames = get_file_lines('booklist.txt')
stopwords = get_file_lines('stopwords.txt')
import collections
import re
from math import log
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in xrange(0, len(l), n):
        yield l[i:i+n]
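# Sanity check (a toy example of my own, not part of the original pipeline):
# seven items chunked in threes should come back as groups of 3, 3, and 1.
assert list(chunks(['a', 'b', 'c', 'd', 'e', 'f', 'g'], 3)) == \
    [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]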
def tokenize(text):
    # Split on word characters and drop anything in the stopword list.
    return filter(
        lambda x: x not in stopwords,
        re.findall(r"\w+", text)
    )
def inv_doc_freq(word, token_chunks):
    """Inverse document frequency: log(1/df), where df is the fraction of
    chunks that contain the word at least once."""
    count = 0
    for chunk in token_chunks:
        if word in chunk:
            count += 1
    df = count / float(len(token_chunks))
    return log(1/df)
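# Worked example with made-up numbers: a word appearing in 2 of 4 chunks has
# df = 0.5, so idf = log(1/0.5) = log(2), about 0.693. A word that appears in
# every chunk gets log(1) = 0 and contributes nothing to its tf-idf score.
assert abs(inv_doc_freq('whale', [['whale'], ['boat'], ['whale'], ['sea']]) - log(2)) < 1e-9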
def tf_idf(tokens, token_chunks):
    """Score every distinct token in `tokens` by term frequency times
    inverse document frequency, highest-scoring first."""
    wordcount = float(len(tokens))
    mc = collections.Counter(tokens).most_common()
    tfidfs = map(
        lambda x: (
            x[0],
            (x[1]/wordcount) * inv_doc_freq(x[0], token_chunks)
        ),
        mc
    )
    return sorted(tfidfs, key=lambda x: -x[1])
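# Quick illustration on toy chunks of my own (not the book corpus): 'whale'
# is frequent in the first chunk but rare across chunks, so it ranks first;
# 'boat' appears in every chunk, so its idf (and hence its score) is zero.
toy_chunks = [['whale', 'whale', 'boat'], ['boat', 'sea'], ['sea', 'boat']]
print(tf_idf(toy_chunks[0], toy_chunks))
# => [('whale', 0.732...), ('boat', 0.0)]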
def get_freq_chunks(booktext, chunksize):
    """Split a book into chunks of `chunksize` paragraphs and compute tf-idf
    scores for each chunk against the rest of the book's chunks."""
    grafs = filter(lambda x: x, booktext.split('\n'))
    grafChunks = list(chunks(grafs, chunksize))
    grafChunkStrs = map(lambda x: '\n'.join(x).lower(), grafChunks)
    tokenChunks = map(tokenize, grafChunkStrs)
    return map(lambda x: tf_idf(x, tokenChunks), tokenChunks)
wordFreqDict = dict()
for fn in book_filenames:
    # Read each book, key it by its filename minus the extension, and store
    # the per-chunk tf-idf scores (10 paragraphs per chunk).
    bookObj = open("books/" + fn, 'r')
    bookText = bookObj.read()
    bookObj.close()
    bookTitle = fn.rsplit('.', 1)[0]
    wordFreqDict[bookTitle] = get_freq_chunks(bookText, 10)
# Notebook-style spot checks: which book titles were indexed, and how many
# chunks one of them produced.
wordFreqDict.keys()
len(wordFreqDict['Sirens of Titan, The'])
import json
with open('tfidf.json', 'w') as outfile:
    json.dump(wordFreqDict, outfile)
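# A minimal sketch of reading the dump back in. The 'Sirens of Titan, The'
# key only exists if booklist.txt listed that file, as above; json.dump turns
# each (word, score) tuple into a [word, score] list.
with open('tfidf.json') as infile:
    freqs = json.load(infile)
first_chunk = freqs['Sirens of Titan, The'][0]
print(first_chunk[:10])  # the ten highest-scoring words in the book's first chunk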