Chapter 1: working with JSON data

Outline:
1) import json
2) set path to the data file
3) preview one line: open(path).readline()
4) load every line into a list: records = [json.loads(x) for x in open(path)]
5) records[i] is the (i+1)-th record; records[i]['x'] is field 'x' of record i
6) time_zones = [y['tz'] for y in records if 'tz' in y]
   # the `if 'tz' in y` guard skips records that have no tz field

In [1]: import json
        path = '2017-07-25.txt'  # ~1.6 MB file in EXECUTION/ (Chapter 2, page 16 of the book)

In [2]: open(path).readline()  # each line is one JSON object
Out[2]: '{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wflqtf", "l": "orofrog", "al": "en-us,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7aqefzjsi\\/1.usa.gov\\/wflqtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'

In [3]: records = [json.loads(line) for line in open(path)]
        # json.loads parses each line (a JSON string) into a Python dict,
        # so records is a list of dicts

In [16]: len(records)
Out[16]: 3560

In [17]: records[0]
Out[17]:
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'al': 'en-us,en;q=0.8',
 'c': 'US',
 'cy': 'Danvers',
 'g': 'A6qOVH',
 'gr': 'MA',
 'h': 'wflqtf',
 'hc': 1331822918,
 'hh': '1.usa.gov',
 'l': 'orofrog',
 'll': [42.576698, -70.954903],
 'nk': 1,
 'r': 'http://www.facebook.com/l/7aqefzjsi/1.usa.gov/wflqtf',
 't': 1331923247,
 'tz': 'America/New_York',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}

The json module turns a JSON string into a Python dict object.

In [4]: records[0]['tz']  # timezone of the first record
Out[4]: 'America/New_York'

In [5]: time_zones = [rec['tz'] for rec in records if 'tz' in rec]
        # skip any rec that has no 'tz' key

In [6]: time_zones[:10]  # the first 10 'tz' values
Out[6]:
['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']

get_counts(sequence) tallies how many times each value appears:
1) counts = {}
2) for x in sequence: if x in counts: counts[x] += 1, else: counts[x] = 1; return counts
   (the else branch initializes a new key at 1; counts[x] += 1 increments an existing one)
   -> {'x1': 1, 'x2': 2, ...}

In [7]: def get_counts(sequence):
            counts = {}
            for x in sequence:
                if x in counts:
                    counts[x] += 1
                else:
                    counts[x] = 1
            return counts
        # count occurrences of each tz

In [8]: get_counts(time_zones)
Out[8]:
{'': 521,
 'Africa/Cairo': 3,

 'Africa/Casablanca': 1,
 'Africa/Ceuta': 2,
 'Africa/Johannesburg': 1,
 'Africa/Lusaka': 1,
 'America/Anchorage': 5,
 'America/Argentina/Buenos_Aires': 1,
 'America/Argentina/Cordoba': 1,
 'America/Argentina/Mendoza': 1,
 'America/Bogota': 3,
 'America/Caracas': 1,
 'America/Chicago': 400,
 'America/Chihuahua': 2,
 'America/Costa_Rica': 1,
 'America/Denver': 191,
 'America/Edmonton': 6,
 'America/Guayaquil': 2,
 'America/Halifax': 4,
 'America/Indianapolis': 20,
 'America/La_Paz': 1,
 'America/Lima': 1,
 'America/Los_Angeles': 382,
 'America/Managua': 3,
 'America/Mazatlan': 1,
 'America/Mexico_City': 15,
 'America/Monterrey': 1,
 'America/Montevideo': 1,
 'America/Montreal': 9,
 'America/New_York': 1251,
 'America/Phoenix': 20,
 'America/Puerto_Rico': 10,
 'America/Rainy_River': 25,
 'America/Recife': 2,
 'America/Santo_Domingo': 1,
 'America/Sao_Paulo': 33,
 'America/St_Kitts': 1,
 'America/Tegucigalpa': 1,
 'America/Vancouver': 12,
 'America/Winnipeg': 4,
 'Asia/Amman': 2,
 'Asia/Bangkok': 6,
 'Asia/Beirut': 4,
 'Asia/Calcutta': 9,
 'Asia/Dubai': 4,
 'Asia/Harbin': 3,
 'Asia/Hong_Kong': 10,
 'Asia/Istanbul': 9,
 'Asia/Jakarta': 3,
 'Asia/Jerusalem': 3,
 'Asia/Karachi': 3,
 'Asia/Kuala_Lumpur': 3,
 'Asia/Kuching': 1,
 'Asia/Manila': 1,
 'Asia/Nicosia': 1,
 'Asia/Novosibirsk': 1,
 'Asia/Pontianak': 1,
 'Asia/Riyadh': 1,
 'Asia/Seoul': 5,

 'Asia/Tokyo': 37,
 'Asia/Yekaterinburg': 1,
 'Australia/NSW': 6,
 'Australia/Queensland': 1,
 'Chile/Continental': 6,
 'Europe/Amsterdam': 22,
 'Europe/Athens': 6,
 'Europe/Belgrade': 2,
 'Europe/Berlin': 28,
 'Europe/Bratislava': 3,
 'Europe/Brussels': 4,
 'Europe/Bucharest': 4,
 'Europe/Budapest': 5,
 'Europe/Copenhagen': 5,
 'Europe/Dublin': 3,
 'Europe/Helsinki': 10,
 'Europe/Lisbon': 8,
 'Europe/Ljubljana': 1,
 'Europe/London': 74,
 'Europe/Madrid': 35,
 'Europe/Malta': 2,
 'Europe/Moscow': 10,
 'Europe/Oslo': 10,
 'Europe/Paris': 14,
 'Europe/Prague': 10,
 'Europe/Riga': 2,
 'Europe/Rome': 27,
 'Europe/Skopje': 1,
 'Europe/Sofia': 1,
 'Europe/Stockholm': 14,
 'Europe/Uzhgorod': 1,
 'Europe/Vienna': 6,
 'Europe/Vilnius': 2,
 'Europe/Volgograd': 1,
 'Europe/Warsaw': 16,
 'Europe/Zurich': 4,
 'Pacific/Auckland': 11,
 'Pacific/Honolulu': 36}

In [9]: counts = get_counts(time_zones)

In [10]: counts['America/New_York']  # look up one key in counts
Out[10]: 1251
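The tallying done by get_counts above is exactly what `collections.Counter` in the standard library provides; a minimal sketch on made-up values (the real time_zones list comes from the records):

```python
from collections import Counter

# Counter tallies occurrences of each distinct value, like get_counts
sample_zones = ['America/New_York', 'America/Denver', 'America/New_York', '']
sample_counts = Counter(sample_zones)
print(sample_counts['America/New_York'])  # -> 2
```

A Counter also behaves like a dict, so the later lookups and .items() calls work on it unchanged.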

In [11]: len(time_zones)
Out[11]: 3440

top_counts(count_dict, n=10) takes a dict of counts and returns the n most
common entries as a list of (count, tz) tuples: build the tuple list from
count_dict.items(), sort it ascending with list.sort(), and keep the last n
with [-n:] (sorting descending and taking [:n] would work just as well).

In [12]: counts.items()
Out[12]:
dict_items([('', 521), ('Asia/Istanbul', 9), ('America/Montevideo', 1),
('Chile/Continental', 6), ('Asia/Manila', 1), ('America/Rainy_River', 25),
('Asia/Novosibirsk', 1), ('Europe/Uzhgorod', 1), ('Pacific/Honolulu', 36),
('America/Sao_Paulo', 33), ('America/La_Paz', 1), ('Europe/Paris', 14),
('Africa/Ceuta', 2), ('America/Mazatlan', 1), ('Europe/Helsinki', 10),
('Asia/Nicosia', 1), ('Asia/Bangkok', 6), ('America/Anchorage', 5),
('Africa/Lusaka', 1), ('Europe/Stockholm', 14), ('America/Mexico_City', 15),
('America/Costa_Rica', 1), ('Europe/Belgrade', 2), ('America/Puerto_Rico', 10),
('America/New_York', 1251), ('Africa/Casablanca', 1), ('Europe/Madrid', 35),
('America/Chicago', 400), ('America/Recife', 2), ('Asia/Yekaterinburg', 1),
('America/Guayaquil', 2), ('Africa/Johannesburg', 1), ('America/Denver', 191),
('America/Caracas', 1), ('Europe/Warsaw', 16), ('Europe/Athens', 6),
('America/St_Kitts', 1), ('America/Argentina/Buenos_Aires', 1),
('Europe/Bucharest', 4), ('Europe/Amsterdam', 22), ('Asia/Riyadh', 1),
('America/Tegucigalpa', 1), ('America/Argentina/Cordoba', 1),
('America/Edmonton', 6), ('Europe/Sofia', 1), ('Europe/Moscow', 10),
('Europe/Dublin', 3), ('Europe/Brussels', 4), ('Europe/Malta', 2),
('Asia/Calcutta', 9), ('Europe/London', 74), ('Europe/Riga', 2),
('America/Phoenix', 20), ('Europe/Zurich', 4), ('Europe/Lisbon', 8),
('Asia/Jakarta', 3), ('America/Halifax', 4), ('America/Vancouver', 12),
('Asia/Pontianak', 1), ('Europe/Berlin', 28), ('America/Chihuahua', 2),
('America/Winnipeg', 4), ('Europe/Skopje', 1), ('Asia/Amman', 2),
('Africa/Cairo', 3), ('America/Indianapolis', 20), ('Asia/Seoul', 5),
('Asia/Jerusalem', 3), ('Europe/Budapest', 5), ('Australia/Queensland', 1),
('Asia/Beirut', 4), ('Asia/Kuala_Lumpur', 3), ('America/Lima', 1),
('America/Monterrey', 1), ('Europe/Copenhagen', 5), ('Asia/Karachi', 3),
('Europe/Ljubljana', 1), ('Asia/Hong_Kong', 10), ('Europe/Volgograd', 1),
('America/Montreal', 9), ('Australia/NSW', 6), ('America/Santo_Domingo', 1),
('Europe/Rome', 27), ('Asia/Tokyo', 37), ('Europe/Vienna', 6),
('Asia/Harbin', 3), ('America/Managua', 3), ('Europe/Bratislava', 3),
('Europe/Oslo', 10), ('Asia/Kuching', 1), ('Pacific/Auckland', 11),
('America/Argentina/Mendoza', 1), ('Europe/Vilnius', 2), ('America/Bogota', 3),
('America/Los_Angeles', 382), ('Europe/Prague', 10), ('Asia/Dubai', 4)])

In [13]: def top_counts(count_dict, n=10):
             # build (count, tz) tuples so that sorting orders by count
             value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
             value_key_pairs.sort()       # ascending sort by count
             return value_key_pairs[-n:]  # last n entries = the n largest counts
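The same top-n selection is available from `Counter.most_common`, which returns (key, count) pairs in descending order; a sketch with toy counts (top_counts restated from above):

```python
from collections import Counter

def top_counts(count_dict, n=10):
    # (count, tz) tuples so that sorting orders by count
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

tally = {'A': 5, 'B': 2, 'C': 9, 'D': 1}
print(top_counts(tally, n=2))         # [(5, 'A'), (9, 'C')]
print(Counter(tally).most_common(2))  # [('C', 9), ('A', 5)]
```

Note the two differ in tuple order and sort direction, but select the same keys.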

In [14]: top_counts(counts)
Out[14]:
[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]

New York (NY) is by far the most common time zone in this JSON data.

2.1.2 Counting time zones with pandas

In [15]: from pandas import DataFrame, Series

In [16]: import pandas as pd
         import numpy as np

In [17]: frame = DataFrame(records)

In [38]: frame
Out[38]:
   _heartbeat_                                                  a              al   c          cy       g  gr
0          NaN  Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...  en-us,en;q=0.8  US     Danvers  A6qOVH  MA
1          NaN                             GoogleMaps/RochesterNY             NaN  US       Provo  mwszks  UT
2          NaN   Mozilla/4.0 (compatible; MSIE 8.0; Windows NT...           en-us  US  Washington  xxr3qb  DC
3          NaN  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...           pt-br  BR        Braz  zcalwp  27
4          NaN  Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...  en-us,en;q=0.8  US  Shrewsbury  9b6kNl  MA
...

In [18]: frame['tz'][:10]
         # frame['tz'] is a Series, and a Series has a value_counts method
Out[18]:
0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7
8
9
Name: tz, dtype: object

In [23]: tz_counts = frame.tz.value_counts()[:10]
         # same as tz_counts = frame['tz'].value_counts()[:10]
         # with pandas the whole tz count is a one-liner:
         # DataFrame(records)['tz'].value_counts()[:10]

In [25]: tz_counts
Out[25]:
America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
Name: tz, dtype: int64

In [26]: clean_tz = frame['tz'].fillna('Missing')  # NaN timezones become 'Missing'

In [27]: clean_tz[clean_tz == ''] = 'Unknown'      # empty-string timezones become 'Unknown'

In [28]: tz_counts = clean_tz.value_counts()

In [29]: tz_counts[:10]
Out[29]:
America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64
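The fillna / boolean-indexing pair above is a general Series-cleanup pattern; a self-contained sketch on a toy Series (values invented):

```python
import numpy as np
import pandas as pd

tz = pd.Series(['America/New_York', '', np.nan, 'Asia/Tokyo', ''])
clean = tz.fillna('Missing')    # NaN becomes the string 'Missing'
clean[clean == ''] = 'Unknown'  # empty strings become 'Unknown'
print(clean.value_counts())
```

Order matters: fillna handles only NaN, so the empty strings must be fixed separately with the boolean mask.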

In [30]: # Graph
         import matplotlib.pyplot as plt
         %matplotlib inline
         tz_counts[:10].plot(kind='barh', rot=0)
Out[30]: <matplotlib.axes._subplots.AxesSubplot at 0x117d641d0>

[Figure: horizontal bar chart of the top 10 tz values]

In [31]: frame['a'][50]
Out[31]: 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'

In [32]: results = Series([x.split()[0] for x in frame.a.dropna()])
         # x.split()[0] keeps the first whitespace-separated token of each
         # agent string ('Mozilla/5.0', 'GoogleMaps/RochesterNY', ...);
         # x[0] would give only the first character ('M', 'G', 'M', ...)
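The difference between x.split()[0] and x[0] in the cell above, shown on one agent string:

```python
agent = 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
print(agent.split()[0])  # 'Mozilla/5.0' -- first whitespace-separated token
print(agent[0])          # 'M'           -- first character only
```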

In [33]: results[:5]  # results is a Series
Out[33]:
0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object

In [34]: results.value_counts()[:8]  # value_counts sorts by count, descending
Out[34]:
Mozilla/5.0                 2594
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
dtype: int64

In [35]: cframe = frame[frame.a.notnull()]
         # drop rows whose agent 'a' is NaN; 3543 rows remain

In [36]: cframe
Out[36]:
   _heartbeat_                                                  a              al   c          cy       g  gr
0          NaN  Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...  en-us,en;q=0.8  US     Danvers  A6qOVH  MA
1          NaN                             GoogleMaps/RochesterNY             NaN  US       Provo  mwszks  UT
2          NaN   Mozilla/4.0 (compatible; MSIE 8.0; Windows NT...           en-us  US  Washington  xxr3qb  DC
3          NaN  Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...           pt-br  BR        Braz  zcalwp  27
4          NaN  Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...  en-us,en;q=0.8  US  Shrewsbury  9b6kNl  MA

In [37]: # classify each agent as Windows / Not Windows
         operating_system = np.where(cframe['a'].str.contains('Windows'),
                                     'Windows', 'Not Windows')

In [38]: Series(operating_system[:5])  # operating_system is a NumPy array
Out[38]:
0        Windows
1    Not Windows
2        Windows
3    Not Windows
4        Windows
dtype: object

In [39]: by_tz_os = cframe.groupby(['tz', operating_system])

In [44]: agg_counts = by_tz_os.size().fillna(0)

In [45]: agg_counts[:10]
Out[45]:
tz
                                Not Windows    245
                                Windows        276
Africa/Cairo                    Windows          3
Africa/Casablanca               Windows          1
Africa/Ceuta                    Windows          2
Africa/Johannesburg             Windows          1
Africa/Lusaka                   Windows          1
America/Anchorage               Not Windows      4
                                Windows          1
America/Argentina/Buenos_Aires  Not Windows      1
dtype: int64

In [49]: agg_counts = by_tz_os.size().unstack().fillna(0)
         # unstack() pivots the inner index level into columns

In [50]: agg_counts
Out[50]:
                                Not Windows  Windows

tz
                                      245.0    276.0
Africa/Cairo                            0.0      3.0
Africa/Casablanca                       0.0      1.0
Africa/Ceuta                            0.0      2.0
Africa/Johannesburg                     0.0      1.0
Africa/Lusaka                           0.0      1.0
America/Anchorage                       4.0      1.0
America/Argentina/Buenos_Aires          1.0      0.0
America/Argentina/Cordoba               0.0      1.0
America/Argentina/Mendoza               0.0      1.0
America/Bogota                          1.0      2.0
America/Caracas                         0.0      1.0
America/Chicago                       115.0    285.0
America/Chihuahua                       1.0      1.0
America/Costa_Rica                      0.0      1.0
America/Denver                        132.0     59.0
America/Edmonton                        2.0      4.0
America/Guayaquil                       2.0      0.0
America/Halifax                         1.0      3.0
America/Indianapolis                    8.0     12.0
America/La_Paz                          0.0      1.0
America/Lima                            0.0      1.0
America/Los_Angeles                   130.0    252.0
America/Managua                         0.0      3.0
America/Mazatlan                        1.0      0.0
America/Mexico_City                     7.0      8.0
America/Monterrey                       1.0      0.0
America/Montevideo                      0.0      1.0
America/Montreal                        3.0      6.0
America/New_York                      339.0    912.0
...                                     ...      ...
Europe/Berlin                           9.0     19.0

Europe/Bratislava                       1.0      2.0
Europe/Brussels                         1.0      3.0
Europe/Bucharest                        1.0      3.0
Europe/Budapest                         0.0      5.0
Europe/Copenhagen                       2.0      3.0
Europe/Dublin                           1.0      2.0
Europe/Helsinki                         2.0      8.0
Europe/Lisbon                           1.0      7.0
Europe/Ljubljana                        0.0      1.0
Europe/London                          43.0     31.0
Europe/Madrid                          16.0     19.0
Europe/Malta                            0.0      2.0
Europe/Moscow                           1.0      9.0
Europe/Oslo                             2.0      8.0
Europe/Paris                            4.0     10.0
Europe/Prague                           3.0      7.0
Europe/Riga                             1.0      1.0
Europe/Rome                             8.0     19.0
Europe/Skopje                           0.0      1.0
Europe/Sofia                            0.0      1.0
Europe/Stockholm                        2.0     12.0
Europe/Uzhgorod                         0.0      1.0
Europe/Vienna                           3.0      3.0
Europe/Vilnius                          0.0      2.0
Europe/Volgograd                        0.0      1.0
Europe/Warsaw                           1.0     15.0
Europe/Zurich                           4.0      0.0
Pacific/Auckland                        3.0      8.0
Pacific/Honolulu                        0.0     36.0

[97 rows x 2 columns]

In [51]: indexer = agg_counts.sum(0).argsort()
         # sum(0) sums down each column, giving just two totals
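The np.where + groupby + size().unstack() chain above, sketched end to end on a toy frame (the tz and agent values here are invented):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'tz': ['NY', 'NY', 'LA', 'LA', 'NY'],
                   'a':  ['Windows NT', 'Macintosh', 'Windows 8',
                          'Macintosh', 'iPhone']})
# label each row by whether its agent string mentions Windows
os_label = np.where(df['a'].str.contains('Windows'), 'Windows', 'Not Windows')
# group by time zone and label, count rows, pivot labels into columns
agg = df.groupby(['tz', os_label]).size().unstack().fillna(0)
print(agg)
```

Mixing a column name with a same-length array in groupby is allowed; the array acts as an extra grouping key.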

In [54]: indexer[:10]
Out[54]:
Not Windows    0
Windows        1
dtype: int64

In [58]: indexer = agg_counts.sum(1).argsort()
         # sum(1) sums across each row; argsort gives the row order that
         # would sort those row totals ascending

In [59]: indexer[:10]
Out[59]:
tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55
dtype: int64

In [60]: indexer[:50]
Out[60]:
tz
                                  24
Africa/Cairo                      20
Africa/Casablanca                 21
Africa/Ceuta                      92
Africa/Johannesburg               87
Africa/Lusaka                     53
America/Anchorage                 54
America/Argentina/Buenos_Aires    57
America/Argentina/Cordoba         26
America/Argentina/Mendoza         55
America/Bogota                    62
America/Caracas                   34
America/Chicago                   60
America/Chihuahua                 36
America/Costa_Rica                37
America/Denver                    27
America/Edmonton                  76
America/Guayaquil                 56
America/Halifax                   89
America/Indianapolis               2
America/La_Paz                     4
America/Lima                       5
America/Los_Angeles                7
America/Managua                    8
America/Mazatlan                   9
America/Mexico_City               86
America/Monterrey                 11
America/Montevideo                14
America/Montreal                  52
America/New_York                  84
America/Phoenix                   17
America/Puerto_Rico               91
America/Rainy_River               40
America/Recife                    66
America/Santo_Domingo             13
America/Sao_Paulo                 33
America/St_Kitts                   3
America/Tegucigalpa               79
America/Vancouver                 51
America/Winnipeg                  45
Asia/Amman                        48
Asia/Bangkok                      50
Asia/Beirut                       23
Asia/Calcutta                     73
Asia/Dubai                        10
Asia/Harbin                        1
Asia/Hong_Kong                    68
Asia/Istanbul                     49
Asia/Jakarta                      69
Asia/Jerusalem                    70
dtype: int64

In [61]: count_subset = agg_counts.take(indexer)[-10:]
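sum(1).argsort() plus take, as used above, selects the rows with the largest totals; the same selection on toy data, with nlargest as a more direct alternative:

```python
import pandas as pd

agg = pd.DataFrame({'Not Windows': [4, 1, 30], 'Windows': [6, 2, 50]},
                   index=['a', 'b', 'c'])
indexer = agg.sum(1).argsort()   # positions that would sort row totals ascending
subset = agg.take(indexer)[-2:]  # keep the 2 rows with the largest totals
# equivalent and more direct:
subset2 = agg.loc[agg.sum(1).nlargest(2).index]
```

Row totals are a=10, b=3, c=80, so both forms keep rows a and c.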

In [62]: count_subset
Out[62]:
                     Not Windows  Windows
tz
America/Sao_Paulo           13.0     20.0
Europe/Madrid               16.0     19.0
Pacific/Honolulu             0.0     36.0
Asia/Tokyo                   2.0     35.0
Europe/London               43.0     31.0
America/Denver             132.0     59.0
America/Los_Angeles        130.0    252.0
America/Chicago            115.0    285.0
                           245.0    276.0
America/New_York           339.0    912.0

In [63]: count_subset.plot(kind='barh', stacked=True)
Out[63]: <matplotlib.axes._subplots.AxesSubplot at 0x11b062630>

In [64]: normed_subset = count_subset.div(count_subset.sum(1), axis=0)
         # divide each row by its own total, so the two columns sum to 1
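count_subset.div(count_subset.sum(1), axis=0) normalizes each row to sum to 1; a toy sketch:

```python
import pandas as pd

counts = pd.DataFrame({'Not Windows': [1.0, 3.0], 'Windows': [3.0, 1.0]},
                      index=['x', 'y'])
# divide every row by that row's total; axis=0 aligns the divisor with rows
normed = counts.div(counts.sum(1), axis=0)
print(normed)
```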

In [65]: normed_subset.plot(kind='barh', stacked=True)
Out[65]: <matplotlib.axes._subplots.AxesSubplot at 0x11b0e2d68>

2.3 US baby names (p. 35)

In [66]: !head -n 10 yob1880.txt  # first 10 lines of the file
Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
Bertha,F,1320
Sarah,F,1288

In [67]: import pandas as pd

In [68]: names1881 = pd.read_csv('names/yob1881.txt', names=['name', 'sex', 'births'])
         # the yearly files live under names/ (cd EXECUTION/ first)

In [64]: names1881
Out[64]:

           name sex  births
0          Mary   F    6919
1          Anna   F    2698
2          Emma   F    2034
3     Elizabeth   F    1852
4      Margaret   F    1658
5        Minnie   F    1653
6           Ida   F    1439
7         Annie   F    1326
8        Bertha   F    1324
9         Alice   F    1308
10        Clara   F    1242
11        Sarah   F    1226
12         Ella   F    1148
13       Nellie   F    1096
14        Grace   F    1089
15     Florence   F    1046
16       Martha   F    1044
17         Cora   F     969
18        Laura   F     962
19       Carrie   F     958
20        Maude   F     923
21       Bessie   F     903
22        Mabel   F     893
23     Gertrude   F     791
24        Ethel   F     788
25       Jennie   F     784
26        Edith   F     778
27       Hattie   F     775
28       Mattie   F     754
29        Julia   F     737
...         ...  ..     ...
1905     Mercer   M       5

1906       Monte   M       5
1907  Montgomery   M       5
1908       Nolan   M       5
1909        Okey   M       5
1910       Orley   M       5
1911        Page   M       5
1912       Philo   M       5
1913      Primus   M       5
1914     Prosper   M       5
1915       Pryor   M       5
1916        Rene   M       5
1917       Robin   M       5
1918        Roll   M       5
1919     Rolland   M       5
1920      Seward   M       5
1921     Shannon   M       5
1922     Talmage   M       5
1923       Urban   M       5
1924      Vaughn   M       5
1925      Verner   M       5
1926     Waverly   M       5
1927     Webster   M       5
1928      Weldon   M       5
1929       Wells   M       5
1930      Wiliam   M       5
1931      Wilton   M       5
1932        Wing   M       5
1933        Wood   M       5
1934      Wright   M       5

[1935 rows x 3 columns]

In [69]: names1881.groupby('sex').births.sum()  # total births by sex
Out[69]:
sex
F     91955
M    100748
Name: births, dtype: int64

In [70]: years = range(1880, 2011)
         pieces = []
         columns = ['name', 'sex', 'births']
         # the yearly files are named yob<year>.txt under names/,
         # so each path can be built with 'names/yob%d.txt' % year
         for year in years:
             path = 'names/yob%d.txt' % year
             frame = pd.read_csv(path, names=columns)
             frame['year'] = year
             pieces.append(frame)
         # concatenate the per-year frames into one; ignore_index=True
         # renumbers the rows instead of keeping each file's own index
         names = pd.concat(pieces, ignore_index=True)
         # names now has the columns name, sex, births, year

In [71]: total_births = names.groupby('year').births.sum()
         # the same aggregation could also be written with pivot_table
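The read-and-concat loop above, runnable end to end against two miniature yob files written to a temp directory (the file contents below are invented stand-ins for the real data):

```python
import os
import tempfile

import pandas as pd

# write two tiny stand-ins for names/yob<year>.txt
tmp = tempfile.mkdtemp()
samples = {1880: 'Mary,F,7065\nJohn,M,9655\n',
           1881: 'Mary,F,6919\nJohn,M,8769\n'}
for year, text in samples.items():
    with open(os.path.join(tmp, 'yob%d.txt' % year), 'w') as f:
        f.write(text)

pieces = []
for year in (1880, 1881):
    frame = pd.read_csv(os.path.join(tmp, 'yob%d.txt' % year),
                        names=['name', 'sex', 'births'])
    frame['year'] = year  # tag every row with its source year
    pieces.append(frame)

# one frame; ignore_index=True renumbers the rows 0..n-1
mini_names = pd.concat(pieces, ignore_index=True)
```

Tagging each piece with its year before concatenation is what later makes the groupby('year') aggregations possible.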

In [72]: total_births.tail()
Out[72]:
year
2006    3946702
2007    3986130
2008    3915955
2009    3801002
2010    3657392
Name: births, dtype: int64

In [73]: import matplotlib.pyplot as plt
         %matplotlib inline
         total_births.plot(title='total births by sex and year')
         plt.show()

In [74]: def add_prop(group):
             births = group.births.astype(float)  # avoid integer division
             group['prop'] = births / births.sum()
             return group
         names = names.groupby(['year', 'sex']).apply(add_prop)

In [78]: names
Out[78]:
            name sex  births  year      prop

0           Mary   F    7065  1880  0.077643
1           Anna   F    2604  1880  0.028618
2           Emma   F    2003  1880  0.022013
3      Elizabeth   F    1939  1880  0.021309
4         Minnie   F    1746  1880  0.019188
5       Margaret   F    1578  1880  0.017342
6            Ida   F    1472  1880  0.016177
7          Alice   F    1414  1880  0.015540
8         Bertha   F    1320  1880  0.014507
9          Sarah   F    1288  1880  0.014155
10         Annie   F    1258  1880  0.013825
11         Clara   F    1226  1880  0.013474
12          Ella   F    1156  1880  0.012704
13      Florence   F    1063  1880  0.011682
14          Cora   F    1045  1880  0.011484
15        Martha   F    1040  1880  0.011429
16         Laura   F    1012  1880  0.011122
17        Nellie   F     995  1880  0.010935
18         Grace   F     982  1880  0.010792
19        Carrie   F     949  1880  0.010429
20         Maude   F     858  1880  0.009429
21         Mabel   F     808  1880  0.008880
22        Bessie   F     794  1880  0.008726
23        Jennie   F     793  1880  0.008715
24      Gertrude   F     787  1880  0.008649
25         Julia   F     783  1880  0.008605
26        Hattie   F     769  1880  0.008451
27         Edith   F     768  1880  0.008440
28        Mattie   F     704  1880  0.007737
29          Rose   F     700  1880  0.007693
...          ...  ..     ...   ...       ...
1690754  Zaviyon   M       5  2010  0.000003
1690755 Zaybrien   M       5  2010  0.000003

1690756   Zayshawn   M       5  2010  0.000003
1690757     Zayyan   M       5  2010  0.000003
1690758       Zeal   M       5  2010  0.000003
1690759     Zealan   M       5  2010  0.000003
1690760   Zecharia   M       5  2010  0.000003
1690761   Zeferino   M       5  2010  0.000003
1690762   Zekariah   M       5  2010  0.000003
1690763       Zeki   M       5  2010  0.000003
1690764     Zeriah   M       5  2010  0.000003
1690765     Zeshan   M       5  2010  0.000003
1690766     Zhyier   M       5  2010  0.000003
1690767   Zildjian   M       5  2010  0.000003
1690768       Zinn   M       5  2010  0.000003
1690769     Zishan   M       5  2010  0.000003
1690770      Ziven   M       5  2010  0.000003
1690771      Zmari   M       5  2010  0.000003
1690772      Zoren   M       5  2010  0.000003
1690773     Zuhaib   M       5  2010  0.000003
1690774     Zyeire   M       5  2010  0.000003
1690775    Zygmunt   M       5  2010  0.000003
1690776   Zykerion   M       5  2010  0.000003
1690777      Zylar   M       5  2010  0.000003
1690778      Zylin   M       5  2010  0.000003
1690779    Zymaire   M       5  2010  0.000003
1690780     Zyonne   M       5  2010  0.000003
1690781  Zyquarius   M       5  2010  0.000003
1690782      Zyran   M       5  2010  0.000003
1690783      Zzyzx   M       5  2010  0.000003

[1690784 rows x 5 columns]
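add_prop via groupby.apply can also be written with transform, which returns a Series aligned to the original rows; a sketch on toy rows (the births figures are invented for illustration):

```python
import numpy as np
import pandas as pd

mini = pd.DataFrame({'name': ['Mary', 'Anna', 'John', 'James'],
                     'sex': ['F', 'F', 'M', 'M'],
                     'births': [7065, 2604, 9655, 5927],
                     'year': [1880, 1880, 1880, 1880]})

# transform('sum') broadcasts each (year, sex) group total back to its rows,
# so the division computes each name's share within its group
group_total = mini.groupby(['year', 'sex'])['births'].transform('sum')
mini['prop'] = mini['births'] / group_total

# sanity check: proportions sum to 1 within every (year, sex) group
assert np.allclose(mini.groupby(['year', 'sex'])['prop'].sum(), 1)
```

The same sanity check is worth running on the full names frame after add_prop.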

In [75]: def get_top1000(group):
             return group.sort_index(by='births', ascending=False)[:1000]
         grouped = names.groupby(['year', 'sex'])
         top1000 = grouped.apply(get_top1000)
/Users/junyamamoto/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2:
FutureWarning: by argument to sort_index is deprecated, please use
.sort_values(by=...)
  from ipykernel import kernelapp as app

In [76]: get_top1000(names)
/Users/junyamamoto/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2:
FutureWarning: by argument to sort_index is deprecated, please use
.sort_values(by=...)
  from ipykernel import kernelapp as app
Out[76]:
            name sex  births  year      prop
431022     Linda   F   99651  1947  0.056229
441381     Linda   F   96185  1948  0.056657
437125     James   M   94601  1947  0.051768
544528   Michael   M   92700  1957  0.043008
437126    Robert   M   91557  1947  0.050102
451624     Linda   F   90952  1949  0.053201
533062   Michael   M   90587  1956  0.042870
556106   Michael   M   90468  1958  0.042673
447426     James   M   88542  1948  0.050495
510710   Michael   M   88462  1954  0.043429
521782   Michael   M   88285  1955  0.042911
437127      John   M   88251  1947  0.048293
427009     James   M   87382  1946  0.053855
489013     James   M   87060  1952  0.044795
478386     James   M   87046  1951  0.046361
591809   Michael   M   86898  1961  0.040958
457680     James   M   86762  1949  0.049015
489014    Robert   M   86566  1952  0.044541
510711     James   M   86274  1954  0.042355

510712    Robert   M   86232  1954  0.042334
270294   Charles   M   29916  1929  0.027833
478387    Robert   M   86189  1951  0.045905
521783     David   M   86144  1955  0.041871
467978     James   M   86094  1950  0.048099
499763    Robert   M   86081  1953  0.043723
579699     David   M   85917  1960  0.040288
499764     James   M   85897  1953  0.043630
447427    Robert   M   85426  1948  0.048718
705826   Michael   M   85298  1970  0.045889
567796   Michael   M   85277  1959  0.039977
691447   Michael   M   85216  1969  0.047625
...          ...  ..     ...   ...       ...
1014919   Robert   M   30224  1987  0.016205
652896   Jeffrey   M   30197  1966  0.016931
973469    Joseph   M   30195  1985  0.016356
560607   Cynthia   F   30178  1959  0.014918
993855      John   M   30173  1986  0.016403
875708    Joseph   M   30165  1980  0.016924
705836   Jeffrey   M   30157  1970  0.016224
973470    Andrew   M   30145  1985  0.016328
1084380   Joseph   M   30112  1990  0.014674
616327    Joseph   M   30095  1963  0.014815
604045    Joseph   M   30088  1962  0.014546
556117    Joseph   M   30081  1958  0.014189
1059927   Robert   M   30061  1989  0.015032
437136      Gary   M   30012  1947  0.016423
515071     Nancy   F   30005  1955  0.015352
801680   Matthew   M   29981  1976  0.019107
326348   Charles   M   29954  1935  0.028784
678016     Brian   M   29941  1968  0.017221
417545   Michael   M   29929  1945  0.022232
953650     Jason   M   29924  1984  0.016599
467989    Ronald   M   29921  1950  0.016716

289586   Charles   M   29906  1931  0.028795
973471      Ryan   M   29875  1985  0.016182
875709    Daniel   M   29866  1980  0.016756
1059928   Joseph   M   29862  1989  0.014932
526183     Donna   F   29848  1956  0.014869
1109322   Joseph   M   29832  1991  0.014775
1059929     John   M   29830  1989  0.014916
493281     Karen   F   29829  1953  0.015866

[1000 rows x 5 columns]

In [77]: top1000
Out[77]:
                       name sex  births  year      prop
year sex
1880 F   0             Mary   F    7065  1880  0.077643
         1             Anna   F    2604  1880  0.028618
         2             Emma   F    2003  1880  0.022013
         3        Elizabeth   F    1939  1880  0.021309
         4           Minnie   F    1746  1880  0.019188
         5         Margaret   F    1578  1880  0.017342
         6              Ida   F    1472  1880  0.016177
         7            Alice   F    1414  1880  0.015540
         8           Bertha   F    1320  1880  0.014507
         9            Sarah   F    1288  1880  0.014155
         10           Annie   F    1258  1880  0.013825
         11           Clara   F    1226  1880  0.013474
         12            Ella   F    1156  1880  0.012704
         13        Florence   F    1063  1880  0.011682
         14            Cora   F    1045  1880  0.011484
         15          Martha   F    1040  1880  0.011429
         16           Laura   F    1012  1880  0.011122
         17          Nellie   F     995  1880  0.010935

         18            Grace   F     982  1880  0.010792
         19           Carrie   F     949  1880  0.010429
         20            Maude   F     858  1880  0.009429
         21            Mabel   F     808  1880  0.008880
         22           Bessie   F     794  1880  0.008726
         23           Jennie   F     793  1880  0.008715
         24         Gertrude   F     787  1880  0.008649
         25            Julia   F     783  1880  0.008605
         26           Hattie   F     769  1880  0.008451
         27            Edith   F     768  1880  0.008440
         28           Mattie   F     704  1880  0.007737
         29             Rose   F     700  1880  0.007693
...                       ...  ..     ...   ...       ...
2010 M   1677617         Yair   M     201  2010  0.000106
         1677616        Talan   M     201  2010  0.000106
         1677614        Keyon   M     201  2010  0.000106
         1677613         Kael   M     201  2010  0.000106
         1677618     Demarion   M     200  2010  0.000105
         1677619       Gibson   M     200  2010  0.000105
         1677620       Reagan   M     200  2010  0.000105
         1677621    Cristofer   M     199  2010  0.000105
         1677622       Daylen   M     199  2010  0.000105
         1677623       Jordon   M     199  2010  0.000105
         1677624      Dashawn   M     198  2010  0.000104
         1677625        Masen   M     198  2010  0.000104
         1677629        Rowen   M     197  2010  0.000104
         1677631       Yousef   M     197  2010  0.000104
         1677630     Thaddeus   M     197  2010  0.000104
         1677628        Kadin   M     197  2010  0.000104
         1677627       Dillan   M     197  2010  0.000104
         1677626     Clarence   M     197  2010  0.000104
         1677634        Slade   M     196  2010  0.000103
         1677632      Clinton   M     196  2010  0.000103
         1677633      Sheldon   M     196  2010  0.000103

         1677636      Keshawn   M     195  2010  0.000103
         1677637     Menachem   M     195  2010  0.000103
         1677635       Joziah   M     195  2010  0.000103
         1677638       Bailey   M     194  2010  0.000102
         1677639       Camilo   M     194  2010  0.000102
         1677640       Destin   M     194  2010  0.000102
         1677641       Jaquan   M     194  2010  0.000102
         1677642       Jaydan   M     194  2010  0.000102
         1677645       Maxton   M     193  2010  0.000102

[261877 rows x 5 columns]

In [78]: boys = top1000[top1000.sex == 'M']

In [79]: girls = top1000[top1000.sex == 'F']

In [80]: total_births = top1000.pivot_table('births', 'year', 'name', aggfunc=sum)

In [81]: total_births
Out[81]:
name  Aaden  Aaliyah  Aarav  Aaron  Aarush   Ab  Abagail  Abb  Abbey  Abbie ...
year
1880    NaN      NaN    NaN  102.0     NaN  NaN      NaN  NaN    NaN   71.0 ...
1881    NaN      NaN    NaN   94.0     NaN  NaN      NaN  NaN    NaN   81.0 ...
1882    NaN      NaN    NaN   85.0     NaN  NaN      NaN  NaN    NaN   80.0 ...
1883    NaN      NaN    NaN  105.0     NaN  NaN      NaN  NaN    NaN   79.0 ...
1884    NaN      NaN    NaN   97.0     NaN  NaN      NaN  NaN    NaN   98.0 ...
1885    NaN      NaN    NaN   88.0     NaN  6.0      NaN  NaN    NaN   88.0 ...
1886    NaN      NaN    NaN   86.0     NaN  NaN      NaN  NaN    NaN   84.0 ...
1887    NaN      NaN    NaN   78.0     NaN  NaN      NaN  NaN    NaN  104.0 ...
1888    NaN      NaN    NaN   90.0     NaN  NaN      NaN  NaN    NaN  137.0 ...

1889    NaN      NaN    NaN     85.0     NaN  NaN      NaN  NaN    NaN  107.0 ...
1890    NaN      NaN    NaN     96.0     NaN  NaN      NaN  6.0    NaN  140.0 ...
1891    NaN      NaN    NaN     69.0     NaN  NaN      NaN  NaN    NaN  124.0 ...
1892    NaN      NaN    NaN     95.0     NaN  NaN      NaN  NaN    NaN  119.0 ...
1893    NaN      NaN    NaN     81.0     NaN  NaN      NaN  NaN    NaN  115.0 ...
1894    NaN      NaN    NaN     79.0     NaN  NaN      NaN  NaN    NaN  118.0 ...
1895    NaN      NaN    NaN     94.0     NaN  NaN      NaN  NaN    NaN   92.0 ...
1896    NaN      NaN    NaN     69.0     NaN  NaN      NaN  NaN    NaN  121.0 ...
1897    NaN      NaN    NaN     87.0     NaN  NaN      NaN  NaN    NaN   97.0 ...
1898    NaN      NaN    NaN     89.0     NaN  NaN      NaN  NaN    NaN  120.0 ...
1899    NaN      NaN    NaN     71.0     NaN  NaN      NaN  NaN    NaN   87.0 ...
1900    NaN      NaN    NaN    104.0     NaN  NaN      NaN  NaN    NaN  112.0 ...
1901    NaN      NaN    NaN     80.0     NaN  NaN      NaN  NaN    NaN   87.0 ...
1902    NaN      NaN    NaN     78.0     NaN  NaN      NaN  NaN    NaN   91.0 ...
1903    NaN      NaN    NaN     93.0     NaN  NaN      NaN  NaN    NaN   91.0 ...
1904    NaN      NaN    NaN    117.0     NaN  NaN      NaN  NaN    NaN   80.0 ...
1905    NaN      NaN    NaN     96.0     NaN  NaN      NaN  NaN    NaN   73.0 ...
1906    NaN      NaN    NaN     96.0     NaN  NaN      NaN  NaN    NaN   72.0 ...
1907    NaN      NaN    NaN    130.0     NaN  NaN      NaN  NaN    NaN   79.0 ...
1908    NaN      NaN    NaN    114.0     NaN  NaN      NaN  NaN    NaN   84.0 ...
1909    NaN      NaN    NaN    142.0     NaN  NaN      NaN  NaN    NaN   57.0 ...
...     ...      ...    ...      ...     ...  ...      ...  ...    ...    ... ...
1981    NaN      NaN    NaN  14832.0     NaN  NaN      NaN  NaN  383.0  292.0 ...
1982    NaN      NaN    NaN  14538.0     NaN  NaN      NaN  NaN  372.0  275.0 ...
1983    NaN      NaN    NaN  14627.0     NaN  NaN      NaN  NaN  419.0  223.0 ...
1984    NaN      NaN    NaN  13387.0     NaN  NaN      NaN  NaN  357.0  249.0 ...
1985    NaN      NaN    NaN  13123.0     NaN  NaN      NaN  NaN  314.0  233.0 ...
1986    NaN      NaN    NaN  12685.0     NaN  NaN      NaN  NaN  369.0  228.0 ...
1987    NaN      NaN    NaN  12676.0     NaN  NaN      NaN  NaN  327.0  228.0 ...
1988    NaN      NaN    NaN  14393.0     NaN  NaN      NaN  NaN  404.0  226.0 ...
1989    NaN      NaN    NaN  15312.0     NaN  NaN      NaN  NaN  470.0  265.0 ...
1990    NaN      NaN    NaN  14545.0     NaN  NaN      NaN  NaN  507.0  311.0 ...
1991    NaN      NaN    NaN  14240.0     NaN  NaN      NaN  NaN  451.0  278.0 ...
1992    NaN      NaN    NaN  14494.0     NaN  NaN      NaN  NaN  430.0  260.0 ...

1993     NaN      NaN    NaN  13819.0     NaN  NaN      NaN  NaN  503.0  291.0 ...
1994     NaN   1451.0    NaN  14379.0     NaN  NaN      NaN  NaN  597.0  351.0 ...
1995     NaN   1254.0    NaN  13277.0     NaN  NaN      NaN  NaN  549.0  351.0 ...
1996     NaN    831.0    NaN  11956.0     NaN  NaN      NaN  NaN  552.0  349.0 ...
1997     NaN   1737.0    NaN  11156.0     NaN  NaN      NaN  NaN  645.0  386.0 ...
1998     NaN   1399.0    NaN  10539.0     NaN  NaN      NaN  NaN  661.0  398.0 ...
1999     NaN   1088.0    NaN   9846.0     NaN  NaN    211.0  NaN  710.0  430.0 ...
2000     NaN   1494.0    NaN   9548.0     NaN  NaN    222.0  NaN  660.0  432.0 ...
2001     NaN   3351.0    NaN   9529.0     NaN  NaN    244.0  NaN  687.0  526.0 ...
2002     NaN   4775.0    NaN   8993.0     NaN  NaN    256.0  NaN  600.0  514.0 ...
2003     NaN   3670.0    NaN   8851.0     NaN  NaN    276.0  NaN  625.0  536.0 ...
2004     NaN   3482.0    NaN   8381.0     NaN  NaN    258.0  NaN  504.0  500.0 ...
2005     NaN   3452.0    NaN   7796.0     NaN  NaN    287.0  NaN  451.0  445.0 ...
2006     NaN   3737.0    NaN   8279.0     NaN  NaN    297.0  NaN  404.0  440.0 ...
2007     NaN   3941.0    NaN   8914.0     NaN  NaN    313.0  NaN  349.0  468.0 ...
2008   955.0   4028.0  219.0   8511.0     NaN  NaN    317.0  NaN  344.0  400.0 ...
2009  1265.0   4352.0  270.0   7936.0     NaN  NaN    296.0  NaN  307.0  369.0 ...
2010   448.0   4628.0  438.0   7374.0   226.0  NaN    277.0  NaN  295.0  324.0 ...

[131 rows x 6868 columns]

In [82]: subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
         # the columns of total_births are the capitalized name strings
         # exactly as they appear in the data
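pivot_table above reshapes the long (year, name, births) rows into a year-by-name matrix, which is what makes the per-name subset plotting possible; a toy sketch with keyword arguments spelled out (the rows are invented):

```python
import pandas as pd

long = pd.DataFrame({'name': ['Mary', 'John', 'Mary', 'John'],
                     'year': [1880, 1880, 1881, 1881],
                     'births': [7065, 9655, 6919, 8769]})
# rows become years, columns become names, cells hold the summed births
wide = long.pivot_table('births', index='year', columns='name', aggfunc='sum')
subset = wide[['John', 'Mary']]
```

Names absent in a given year would show up as NaN cells, exactly as in the total_births output above.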

In [83]: subset.plot()
Out[83]: <matplotlib.axes._subplots.AxesSubplot at 0x11b36d208>

In [85]: table = top1000.pivot_table('prop', 'year', 'sex', aggfunc=sum)

In [87]: table.plot(title='top1000 over total births')
Out[87]: <matplotlib.axes._subplots.AxesSubplot at 0x124abc5c0>
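The FutureWarning emitted by get_top1000 earlier is about the old sort_index(by=...) spelling; with sort_values the same top-n-per-group selection reads as follows (toy rows, n=1 for brevity):

```python
import pandas as pd

mini = pd.DataFrame({'name': ['Mary', 'Anna', 'John', 'James'],
                     'sex': ['F', 'F', 'M', 'M'],
                     'births': [7065, 2604, 9655, 5927],
                     'year': [1880] * 4})

def get_top(group, n=1):
    # modern spelling of group.sort_index(by='births', ascending=False)[:n]
    return group.sort_values(by='births', ascending=False)[:n]

top = mini.groupby(['year', 'sex']).apply(get_top)
print(sorted(top['name']))  # ['John', 'Mary']
```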