Connect to database¶

(BigQuery Natality table I set up)¶

SELECT year, month, day, wday, state, is_male, gestation_weeks, mother_age, father_age, apgar_1min, apgar_5min, weight_pounds FROM [publicdata:samples.natality] where year between 1980 and 1984

%%sql
#count how many records are in my table
select count(*) 
from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]

#check out the schema of the table
%bigquery schema --table thematic-lore-110201:Natality_1980_1984.Births_1980_1984

bigquery command + pandas to make this sql into a dataframe¶

%%sql --module full_table
select * from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]
--it said the results were too large, so filtering down to one day
 where year = 1982 and month = 1 and day = 20

import gcp.bigquery as bq
import pandas as pd

df = bq.Query(full_table).to_dataframe()
len(df)

9503

Let's look at the data

df.head(10)

#How many males and females born on this day?
groups = df.groupby('is_male')
groups.dtypes
for name, df_group in groups:
  if name == True: 
    sex = 'male' 
  else: 
    sex = 'female'
  print '%d %s babies' % (len(df_group), sex)
  
#how many babies in each state?
groups = df.groupby(['state','is_male'])
table = pd.pivot_table(df, index='state', columns='is_male', aggfunc=len, values='year')
#create a column that is the total count by state, not split by group
table_totals = pd.pivot_table(df, index='state', aggfunc=len, values='year')
table = pd.concat((table,table_totals),axis=1)
#rename the columns
table.columns = ['female','male','total']
#display
groups.size()
table

4635 female babies
4868 male babies

chart births by month and year - line chart¶

%%sql --module birthdata
select year, month, concat(string(month),"/",string(year)) as yearmonth, count(*) as births
from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]
group by year, month, yearmonth
order by year, month

#put result of query in a dataframe
df_ym = bq.Query(birthdata).to_dataframe()

import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.figure(); 
df_ym.plot(x='yearmonth',y='births');

<matplotlib.figure.Figure at 0x7fd261889490>

Now we'll recreate the getstation weeks vs birth weight scatterplot¶

%%sql --module gestation_weight
select gestation_weeks, weight_pounds
from [thematic-lore-110201:Natality_1980_1984.Births_1980_1984]
where year = 1982 and month = 1 and day = 20
and gestation_weeks < 99 and gestation_weeks is not null and weight_pounds is not null
--forgot to bring in live baby variable, so using apgar as proxy
and apgar_1min is not null and apgar_1min < 99

import numpy as np
#put result of query in a dataframe
df_gw = bq.Query(gestation_weight).to_dataframe()
df_gw_numeric = df_gw[np.isreal(df_gw).any(axis=1)]
#df_gw.plot(kind='scatter', x='gestation_weeks', y='weight_pounds')
x=df_gw_numeric['gestation_weeks']
y=df_gw_numeric['weight_pounds']
m,b = np.polyfit(x, y, 1)
yline = m*x+b # regression line

df_gw_numeric = pd.concat((df_gw_numeric,yline),axis=1)
df_gw_numeric.columns = ['gestation_weeks','weight_pounds','fit_line']

plt.figure()
df_gw_numeric.plot('gestation_weeks','fit_line','line',zorder=0);
plt.scatter(df_gw_numeric['gestation_weeks'],df_gw_numeric['weight_pounds']);
plt.xlim(10,60)
plt.show()

<matplotlib.figure.Figure at 0x7fd26158ba50>

hm, still not as high of an R^2 as I would have expected! (similar for linear) Lots of variation in the weight even at similar weeks.

	year	month	day	wday	state	is_male	gestation_weeks	mother_age	father_age	apgar_1min	apgar_5min	weight_pounds
0	1982	1	20	None	AR	False	45	24	20	8	9	6.750554
1	1982	1	20	None	CA	True	37	35	38	99	99	7.363440
2	1982	1	20	None	CA	False	39	22	27	99	99	5.313141
3	1982	1	20	None	CA	True	38	24	27	99	99	8.249698
4	1982	1	20	None	AR	False	38	26	30	7	2	6.172943
5	1982	1	20	None	CA	False	39	25	25	99	99	8.501025
6	1982	1	20	None	CT	True	40	24	26	9	9	8.313632
7	1982	1	20	None	CA	False	37	26	26	99	99	7.561856
8	1982	1	20	None	CA	False	43	24	23	99	99	7.374463
9	1982	1	20	None	CA	True	40	21	28	99	99	5.813590

	female	male	total
state
AK	14	14	28
AL	85	76	161
AR	51	35	86
AZ	39	36	75
CA	292	293	585
CO	80	90	170
CT	62	49	111
DC	14	12	26
DE	3	10	13
FL	190	216	406
GA	43	70	113
HI	26	21	47
IA	67	69	136
ID	25	37	62
IL	240	270	510
IN	100	120	220
KS	52	71	123
KY	69	96	165
LA	128	131	259
MA	113	97	210
MD	53	77	130
ME	23	24	47
MI	149	212	361
MN	99	90	189
MO	114	127	241
MS	57	58	115
MT	16	19	35
NC	138	108	246
ND	8	11	19
NE	33	38	71
NH	24	17	41
NJ	128	140	268
NM	37	40	77
NV	16	20	36
NY	343	334	677
OH	239	256	495
OK	78	98	176
OR	68	70	138
PA	234	220	454
RI	18	19	37
SC	58	82	140
SD	20	15	35
TN	108	100	208
TX	445	437	882
UT	62	80	142
VA	107	117	224
VT	5	8	13
WA	105	102	207
WI	111	91	202
WV	29	35	64
WY	17	10	27