"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com

Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

"""
Results:

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 117.7817     0.3408 345.562  < 2e-16 ***
first        -2.2472     0.4909  -4.578 4.75e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 23.38 on 9082 degrees of freedom
Multiple R-squared: 0.002302,	Adjusted R-squared: 0.002193 
F-statistic: 20.96 on 1 and 9082 DF,  p-value: 4.754e-06 

## With just first, we confirm that first babies are lighter.

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 109.52288    1.12557  97.304  < 2e-16 ***
ages          0.28773    0.04405   6.531 6.87e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 23.35 on 9082 degrees of freedom
Multiple R-squared: 0.004675,	Adjusted R-squared: 0.004565 
F-statistic: 42.66 on 1 and 9082 DF,  p-value: 6.868e-11 


## With just ages, we confirm that younger women have lighter babies.
## The slope and intercept are the same as in agemodel.py.


Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 111.1561     1.2854  86.474  < 2e-16 ***
first        -1.3599     0.5175  -2.628   0.0086 ** 
ages          0.2485     0.0465   5.345 9.26e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 23.34 on 9081 degrees of freedom
Multiple R-squared: 0.005432,	Adjusted R-squared: 0.005212 
F-statistic:  24.8 on 2 and 9081 DF,  p-value: 1.821e-11 


## With first and ages, the effect of first gets smaller by about 40%


Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 93.226968   4.554625  20.469  < 2e-16 ***
ages         1.598195   0.357644   4.469 7.97e-06 ***
ages2       -0.025098   0.006797  -3.692 0.000224 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 23.33 on 9081 degrees of freedom
Multiple R-squared: 0.006167,	Adjusted R-squared: 0.005948 
F-statistic: 28.18 on 2 and 9081 DF,  p-value: 6.331e-13 

## With ages and ages2 we get a non-linear model that explains
## just a little bit more of the variance.


Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 95.881747   4.719302  20.317  < 2e-16 ***
first       -1.118630   0.522119  -2.142 0.032181 *  
ages         1.460500   0.363303   4.020 5.87e-05 ***
ages2       -0.023078   0.006861  -3.364 0.000772 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 23.33 on 9080 degrees of freedom
Multiple R-squared: 0.006669,	Adjusted R-squared: 0.006341 
F-statistic: 20.32 on 3 and 9080 DF,  p-value: 4.039e-13 

## With all three variables, the effect of first is smaller still,
## and on the border of statistical significance.

## Bottom line: first babies are lighter than other babies;  About
## 50% of this effect is explained by mother's age, but the other
## half looks like it is legit.

Note: this analysis uses the lm (linear model) function provided by
rpy2, which is the Python interface to the R statistical software
system.  You can read about R at  http://rpy.sourceforge.net/

If you are running Ubuntu, you can install R and rpy to by running

sudo apt-get install python-rpy2

"""

import rpy2.robjects as robjects
r = robjects.r

import agemodel


def GetAgeWeightFirst(table):
    """Get sequences of mother's age, birth weight, and first baby flag.

    Args:
        table: Table object

    Returns:
        tuple of sequences (ages, weights, first_bool)
    """
    ages = []
    weights = []
    first_bool = []
    for r in table.records:
        if 'NA' in [r.agepreg, r.totalwgt_oz, r.birthord]:
            continue

        # first is 1.0 for first babies; 0.0 for others
        if r.birthord == 1:
            first = 1.0
        else:
            first = 0.0
        
        ages.append(r.agepreg)
        weights.append(r.totalwgt_oz)
        first_bool.append(first)

    return ages, weights, first_bool


def RunModel(model, print_flag=True):
    """Submits model to r.lm and returns the result."""
    model = r(model)
    res = r.lm(model)
    if print_flag:
        PrintSummary(res)
    return res


def PrintSummary(res):
    """Prints results from r.lm (just the parts we want)."""
    flag = False
    lines = r.summary(res)
    lines = str(lines)

    for line in lines.split('\n'):
        # skip everything until we get to coefficients
        if line.startswith('Coefficients'):
            flag = True
        if flag:
            print line
    print


def main(script, model_number=0):

    model_number = int(model_number)

    # get the data
    pool, firsts, others = agemodel.MakeTables()
    ages, weights, first_bool = GetAgeWeightFirst(pool)
    ages2 = [age**2 for age in ages]

    # put the data into the R environment
    robjects.globalEnv['weights'] = robjects.FloatVector(weights)
    robjects.globalEnv['ages'] = robjects.FloatVector(ages)
    robjects.globalEnv['ages2'] = robjects.FloatVector(ages2)
    robjects.globalEnv['first'] = robjects.FloatVector(first_bool)

    # run the models
    models = ['weights ~ first',
              'weights ~ ages',
              'weights ~ first + ages',
              'weights ~ ages + ages2',
              'weights ~ first + ages + ages2']

    model = models[model_number]
    print model
    RunModel(model)


if __name__ == '__main__':
    import sys
    main(*sys.argv)
