"""This file contains code for use with "Think Stats", by Allen B. Downey, available from greenteapress.com Copyright 2010 Allen B. Downey License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html """ """ Results: on a log-log scale the tail of the CCDF is a straight line, which suggests that the Pareto distribution is a good model for this data, at least for people with taxable income above the median. """ import csv import sys import myplot import Pmf import Cdf def ReadIncomeFile(filename='08in11si.csv'): """Reads a data file from the IRS and returns the first two columns. Skips the header and returns only the first table (non-cumulative). Args: filename: string data file Returns: list of string pairs """ reader = csv.reader(open(filename)) for line in reader: if line[0] == 'All returns': break t = [] for line in reader: if line[0].startswith('Accumulated'): break t.append(line[0:2]) return t def MakeIncomeDist(data): """Converts the strings from the IRS file to a Hist, Pmf and Cdf. Args: data: list of (dollar range, number of returns) string pairs Returns: tuple of (Hist, Pmf, Cdf) representing the number of returns in each bin """ def clean(s): """Converts dollar amounts to integers.""" try: return int(s.lstrip('$')) except ValueError: if s in ['No', 'income']: return 0 if s == 'more': return -1 return None def midpoint(low, high): """Finds the midpoint of a range.""" if high == -1: return low * 3 / 2 else: return (low + high) / 2 hist = Pmf.Hist() for column, number in data: # convert the number of returns number = number.replace(',', '') number = int(number) # convert the income range column = column.replace(',', '') t = column.split() low, high = t[0], t[-1] low, high = clean(low), clean(high) # add to the histogram x = midpoint(low, high) hist.Incr(x, number) print x, number pmf = Pmf.MakePmfFromHist(hist) cdf = Cdf.MakeCdfFromDict(pmf.GetDict()) return hist, pmf, cdf def main(script, *args): data = ReadIncomeFile() hist, pmf, cdf = MakeIncomeDist(data) # plot the CDF on a log-x scale myplot.Cdf(cdf, root='income_logx', xscale='log') # plot the complementary CDF on a log-log scale myplot.Cdf(cdf, root='income_loglog', complement=True, xscale='log', yscale='log', show=True) if __name__ == "__main__": main(*sys.argv)