Lab3-3_BA

Berent Aldikacti

09/09/20

In [38]:
import pandas as pd
surveys_df = pd.read_csv("data/surveys.csv")

Challenge 1

In [39]:
surveys_df.columns
Out[39]:
Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',
       'hindfoot_length', 'weight'],
      dtype='object')
In [40]:
surveys_df.shape
Out[40]:
(35549, 9)
In [41]:
surveys_df.head(15)
Out[41]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
5 6 7 16 1977 1 PF M 14.0 NaN
6 7 7 16 1977 2 PE F NaN NaN
7 8 7 16 1977 1 DM M 37.0 NaN
8 9 7 16 1977 1 DM F 34.0 NaN
9 10 7 16 1977 6 PF F 20.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
11 12 7 16 1977 7 DM M 38.0 NaN
12 13 7 16 1977 3 DM M 35.0 NaN
13 14 7 16 1977 8 DM NaN NaN NaN
14 15 7 16 1977 6 DM F 36.0 NaN
In [42]:
surveys_df.tail()
Out[42]:
record_id month day year plot_id species_id sex hindfoot_length weight
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

Challenge 2

In [43]:
site_names = pd.unique(surveys_df['plot_id'])
site_names
Out[43]:
array([ 2,  3,  7,  1,  6,  5,  8,  4, 11, 14, 15, 13,  9, 10, 17, 16, 20,
       23, 18, 21, 22, 19, 12, 24])
In [44]:
surveys_df['plot_id'].nunique()
Out[44]:
24
In [45]:
surveys_df['species_id'].nunique()
Out[45]:
48
In [46]:
len(site_names) - surveys_df['plot_id'].nunique()
Out[46]:
0

Challenge 3

In [47]:
grouped_data = surveys_df.groupby('sex')
grouped_data['sex'].count()
Out[47]:
sex
F    15690
M    17348
Name: sex, dtype: int64
In [48]:
grouped_data2 = surveys_df.groupby(['plot_id', 'sex'])
grouped_data2.mean()
Out[48]:
record_id month day year hindfoot_length weight
plot_id sex
1 F 18390.384434 6.597877 15.338443 1990.933962 31.733911 46.311138
M 17197.740639 6.121461 15.905936 1990.091324 34.302770 55.950560
2 F 17714.753608 6.426804 16.288660 1990.449485 30.161220 52.561845
M 18085.458042 6.340035 15.440559 1990.756119 30.353760 51.391382
3 F 19888.783875 6.610302 15.993281 1992.013438 23.774044 31.215349
M 20226.767857 6.277381 16.271429 1992.275000 23.833744 34.163241
4 F 17489.205275 6.447248 15.608945 1990.235092 33.249102 46.818824
M 18493.841748 6.436893 16.303883 1991.000971 34.097959 48.888119
5 F 12280.793169 6.142315 15.722960 1986.485769 28.921844 40.974806
M 12798.426621 6.194539 15.703072 1986.817406 29.694794 40.708551
6 F 19406.503392 6.640434 15.947083 1991.579376 26.981322 36.352288
M 17849.574607 6.043194 15.856021 1990.556283 27.425591 36.867388
7 F 19069.668657 6.385075 15.313433 1991.441791 19.779553 20.006135
M 19188.729642 6.719870 15.778502 1991.462541 20.536667 21.194719
8 F 18920.276190 6.644048 15.479762 1991.267857 32.187578 45.623011
M 19452.109868 6.585961 15.427263 1991.686673 33.751059 49.641372
9 F 16217.497069 6.507620 15.309496 1989.303634 35.126092 53.618469
M 18000.710159 6.369522 14.970120 1990.632470 34.175732 49.519309
10 F 16001.496454 5.588652 16.964539 1989.248227 18.641791 17.094203
M 15708.704225 5.718310 16.739437 1989.007042 19.567164 19.971223
11 F 16994.962287 6.770073 15.954988 1989.836983 32.029299 43.515075
M 16933.909621 6.379981 15.799806 1989.856171 32.078014 43.366197
12 F 17457.966981 6.509434 16.305660 1990.266981 30.975124 49.831731
M 17592.327500 6.304167 16.367500 1990.400833 31.762489 48.909710
13 F 18033.100318 6.815287 15.847134 1990.619427 27.201014 40.524590
M 16969.044700 6.485313 15.851852 1989.911877 27.893793 40.097754
14 F 17097.145275 6.519041 16.427362 1989.974612 32.973373 47.355491
M 17891.948598 6.666355 16.336449 1990.587850 32.961802 45.159378
15 F 20602.449064 6.582121 15.787942 1992.523909 21.949891 26.670236
M 18104.019560 6.193154 17.193154 1990.770171 21.803109 27.523691
16 F 19002.445946 6.369369 16.549550 1991.351351 23.144928 25.810427
M 18434.714286 6.208791 16.402930 1990.926740 23.480916 23.811321
17 F 18234.322870 6.650224 15.892377 1990.785874 30.918536 48.176201
M 18857.651472 6.569801 16.183286 1991.331434 32.227634 47.558853
18 F 17940.875497 6.698013 15.960265 1990.536424 26.690341 36.963514
M 15106.718850 6.610224 16.797125 1988.551118 27.703072 43.546952
19 F 21848.216475 6.701149 15.226054 1993.417625 21.257937 21.978599
M 19470.779690 6.533563 16.647160 1991.740103 21.071685 20.306878
20 F 17510.769231 6.743077 16.026154 1990.253846 27.069193 52.624406
M 16076.192496 6.489396 16.375204 1989.243067 27.908451 44.197279
21 F 22452.636661 6.860884 16.307692 1993.878887 22.366554 25.974832
M 20120.399113 6.671840 16.203991 1992.199557 21.736721 22.772622
22 F 18499.695976 6.651267 15.521610 1990.973174 34.108320 53.647059
M 18015.365527 6.381872 16.682021 1990.650817 33.359746 54.572531
23 F 15863.193939 6.860606 16.036364 1989.024242 20.051948 20.564417
M 17091.338164 6.391304 16.077295 1989.961353 19.850000 18.941463
24 F 13702.224280 6.596708 16.393004 1987.485597 26.993377 47.914405
M 15208.136082 6.360825 16.971134 1988.641237 25.786996 39.321503
In [49]:
grouped_data3 = surveys_df.groupby('plot_id')
grouped_data3['weight'].describe()
Out[49]:
count mean std min 25% 50% 75% max
plot_id
1 1903.0 51.822911 38.176670 4.0 30.0 44.0 53.0 231.0
2 2074.0 52.251688 46.503602 5.0 24.0 41.0 50.0 278.0
3 1710.0 32.654386 35.641630 4.0 14.0 23.0 36.0 250.0
4 1866.0 47.928189 32.886598 4.0 30.0 43.0 50.0 200.0
5 1092.0 40.947802 34.086616 5.0 21.0 37.0 48.0 248.0
6 1463.0 36.738893 30.648310 5.0 18.0 30.0 45.0 243.0
7 638.0 20.663009 21.315325 4.0 11.0 17.0 23.0 235.0
8 1781.0 47.758001 33.192194 5.0 26.0 44.0 51.0 178.0
9 1811.0 51.432358 33.724726 6.0 36.0 45.0 50.0 275.0
10 279.0 18.541219 20.290806 4.0 10.0 12.0 21.0 237.0
11 1793.0 43.451757 28.975514 5.0 26.0 42.0 48.0 212.0
12 2219.0 49.496169 41.630035 6.0 26.0 42.0 50.0 280.0
13 1371.0 40.445660 34.042767 5.0 20.5 33.0 45.0 241.0
14 1728.0 46.277199 27.570389 5.0 36.0 44.0 49.0 222.0
15 869.0 27.042578 35.178142 4.0 11.0 18.0 26.0 259.0
16 480.0 24.585417 17.682334 4.0 12.0 20.0 34.0 158.0
17 1893.0 47.889593 35.802399 4.0 27.0 42.0 50.0 216.0
18 1351.0 40.005922 38.480856 5.0 17.5 30.0 44.0 256.0
19 1084.0 21.105166 13.269840 4.0 11.0 19.0 27.0 139.0
20 1222.0 48.665303 50.111539 5.0 17.0 31.0 47.0 223.0
21 1029.0 24.627794 21.199819 4.0 10.0 22.0 31.0 190.0
22 1298.0 54.146379 38.743967 5.0 29.0 42.0 54.0 212.0
23 369.0 19.634146 18.382678 4.0 10.0 14.0 23.0 199.0
24 960.0 43.679167 45.936588 4.0 19.0 27.5 45.0 251.0

Challenge 4

In [50]:
species_counts = surveys_df.groupby('species_id')['record_id'].count()
print(species_counts)
species_id
AB      303
AH      437
AS        2
BA       46
CB       50
CM       13
CQ       16
CS        1
CT        1
CU        1
CV        1
DM    10596
DO     3027
DS     2504
DX       40
NL     1252
OL     1006
OT     2249
OX       12
PB     2891
PC       39
PE     1299
PF     1597
PG        8
PH       32
PI        9
PL       36
PM      899
PP     3123
PU        5
PX        6
RF       75
RM     2609
RO        8
RX        2
SA       75
SC        1
SF       43
SH      147
SO       43
SS      248
ST        1
SU        5
UL        4
UP        8
UR       10
US        4
ZL        2
Name: record_id, dtype: int64

Challenge 5

In [51]:
grouped_ave = surveys_df.groupby('plot_id')['weight'].mean()
grouped_ave.plot(kind='bar');
In [52]:
grouped_total = surveys_df.groupby('sex').count()
grouped_total.plot(kind='bar');

Summary Plot Challenge

In [53]:
by_site_sex = surveys_df.groupby(['plot_id', 'sex'])
site_sex_count = by_site_sex['weight'].sum()
spc = site_sex_count.unstack()
s_plot = spc.plot(kind='bar', stacked=True, title="Total weight by site and sex")
s_plot.set_ylabel("Weight")
s_plot.set_xlabel("Plot")
Out[53]:
Text(0.5, 0, 'Plot')