Connect to databaseΒΆ

(BigQuery Natality table I set up)ΒΆ

SELECT year, month, day, wday, state, is_male, gestation_weeks, mother_age, father_age, apgar_1min, apgar_5min, weight_pounds FROM [publicdata:samples.natality] where year between 1980 and 1984

In [1]:
%%sql
#count how many records are in my table
select count(*) 
from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]
Out[1]:

(rows: 1, time: 0.5s, cached, job: job_c7YxNk5xfob_Y51L83FbX3gtv2w)
In [2]:
#check out the schema of the table
%bigquery schema --table thematic-lore-110201:Natality_1980_1984.Births_1980_1984
Out[2]:

bigquery command + pandas to make this sql into a dataframeΒΆ

In [78]:
%%sql --module full_table
select * from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]
--it said the results were too large, so filtering down to one day
 where year = 1982 and month = 1 and day = 20
In [79]:
import gcp.bigquery as bq
import pandas as pd
In [80]:
df = bq.Query(full_table).to_dataframe()
len(df)
Out[80]:
9503

Let's look at the data

In [11]:
df.head(10)
Out[11]:
year month day wday state is_male gestation_weeks mother_age father_age apgar_1min apgar_5min weight_pounds
0 1982 1 20 None AR False 45 24 20 8 9 6.750554
1 1982 1 20 None CA True 37 35 38 99 99 7.363440
2 1982 1 20 None CA False 39 22 27 99 99 5.313141
3 1982 1 20 None CA True 38 24 27 99 99 8.249698
4 1982 1 20 None AR False 38 26 30 7 2 6.172943
5 1982 1 20 None CA False 39 25 25 99 99 8.501025
6 1982 1 20 None CT True 40 24 26 9 9 8.313632
7 1982 1 20 None CA False 37 26 26 99 99 7.561856
8 1982 1 20 None CA False 43 24 23 99 99 7.374463
9 1982 1 20 None CA True 40 21 28 99 99 5.813590
In [55]:
#How many males and females born on this day?
groups = df.groupby('is_male')
groups.dtypes
for name, df_group in groups:
  if name == True: 
    sex = 'male' 
  else: 
    sex = 'female'
  print '%d %s babies' % (len(df_group), sex)
  
#how many babies in each state?
groups = df.groupby(['state','is_male'])
table = pd.pivot_table(df, index='state', columns='is_male', aggfunc=len, values='year')
#create a column that is the total count by state, not split by group
table_totals = pd.pivot_table(df, index='state', aggfunc=len, values='year')
table = pd.concat((table,table_totals),axis=1)
#rename the columns
table.columns = ['female','male','total']
#display
groups.size()
table
4635 female babies
4868 male babies
Out[55]:
female male total
state
AK 14 14 28
AL 85 76 161
AR 51 35 86
AZ 39 36 75
CA 292 293 585
CO 80 90 170
CT 62 49 111
DC 14 12 26
DE 3 10 13
FL 190 216 406
GA 43 70 113
HI 26 21 47
IA 67 69 136
ID 25 37 62
IL 240 270 510
IN 100 120 220
KS 52 71 123
KY 69 96 165
LA 128 131 259
MA 113 97 210
MD 53 77 130
ME 23 24 47
MI 149 212 361
MN 99 90 189
MO 114 127 241
MS 57 58 115
MT 16 19 35
NC 138 108 246
ND 8 11 19
NE 33 38 71
NH 24 17 41
NJ 128 140 268
NM 37 40 77
NV 16 20 36
NY 343 334 677
OH 239 256 495
OK 78 98 176
OR 68 70 138
PA 234 220 454
RI 18 19 37
SC 58 82 140
SD 20 15 35
TN 108 100 208
TX 445 437 882
UT 62 80 142
VA 107 117 224
VT 5 8 13
WA 105 102 207
WI 111 91 202
WV 29 35 64
WY 17 10 27

chart births by month and year - line chartΒΆ

In [3]:
%%sql --module birthdata
select year, month, concat(string(month),"/",string(year)) as yearmonth, count(*) as births
from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]
group by year, month, yearmonth
order by year, month
In [4]:
#put result of query in a dataframe
df_ym = bq.Query(birthdata).to_dataframe()
In [34]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.figure(); 
df_ym.plot(x='yearmonth',y='births');
<matplotlib.figure.Figure at 0x7fd261889490>

Now we'll recreate the getstation weeks vs birth weight scatterplotΒΆ

In [13]:
%%sql --module gestation_weight
select gestation_weeks, weight_pounds
from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]
where year = 1982 and month = 1 and day = 20
and gestation_weeks < 99 and gestation_weeks is not null and weight_pounds is not null
--forgot to bring in live baby variable, so using apgar as proxy
and apgar_1min is not null and apgar_1min < 99
In [77]:
import numpy as np
#put result of query in a dataframe
df_gw = bq.Query(gestation_weight).to_dataframe()
df_gw_numeric = df_gw[np.isreal(df_gw).any(axis=1)]
#df_gw.plot(kind='scatter', x='gestation_weeks', y='weight_pounds')
x=df_gw_numeric['gestation_weeks']
y=df_gw_numeric['weight_pounds']
m,b = np.polyfit(x, y, 1)
yline = m*x+b # regression line

df_gw_numeric = pd.concat((df_gw_numeric,yline),axis=1)
df_gw_numeric.columns = ['gestation_weeks','weight_pounds','fit_line']

plt.figure()
df_gw_numeric.plot('gestation_weeks','fit_line','line',zorder=0);
plt.scatter(df_gw_numeric['gestation_weeks'],df_gw_numeric['weight_pounds']);
plt.xlim(10,60)
plt.show()
<matplotlib.figure.Figure at 0x7fd26158ba50>

hm, still not as high of an R^2 as I would have expected! (similar for linear) Lots of variation in the weight even at similar weeks.