Lab3-4_BA

Berent Aldikacti

09/12/20

In [9]:
import pandas as pd
surveys_df = pd.read_csv("data/surveys.csv")

Challenge 1

In [10]:
a = [1, 2, 3, 4, 5]
a[0]
Out[10]:
1
In [11]:
a[5]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-11-4a84d856522b> in <module>
----> 1 a[5]

IndexError: list index out of range
  1. Above returns an error because it does not have a 6th character in the list.
In [ ]:
a[len(a)]

Challenge 2

In [12]:
surveys_df = pd.read_csv("data/surveys.csv")
surveys_df.head()
Out[12]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [13]:
surveys_df.iloc[0:3, 1:4]
Out[13]:
month day year
0 7 16 1977
1 7 16 1977
2 7 16 1977
In [14]:
surveys_df.loc[[0, 10], :]
Out[14]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
In [15]:
surveys_df.loc[0, ['species_id', 'plot_id', 'weight']]
Out[15]:
species_id     NL
plot_id         2
weight        NaN
Name: 0, dtype: object
In [16]:
surveys_df.loc[[0, 10, 35549], :]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-16-b2913a4edc78> in <module>
----> 1 surveys_df.loc[[0, 10, 35549], :]

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    871                     # AttributeError for IntervalTree get_value
    872                     pass
--> 873             return self._getitem_tuple(key)
    874         else:
    875             # we by definition only have the 0th axis

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   1053             return self._multi_take(tup)
   1054 
-> 1055         return self._getitem_tuple_same_dim(tup)
   1056 
   1057     def _get_label(self, label, axis: int):

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_tuple_same_dim(self, tup)
    748                 continue
    749 
--> 750             retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
    751             # We should never have retval.ndim < self.ndim, as that should
    752             #  be handled by the _getitem_lowerdim call above.

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1097                     raise ValueError("Cannot index with multidimensional key")
   1098 
-> 1099                 return self._getitem_iterable(key, axis=axis)
   1100 
   1101             # nested tuple slicing

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis)
   1035 
   1036         # A collection of keys
-> 1037         keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
   1038         return self.obj._reindex_with_indexers(
   1039             {axis: [keyarr, indexer]}, copy=True, allow_dups=True

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1252             keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
   1253 
-> 1254         self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing)
   1255         return keyarr, indexer
   1256 

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1314                 with option_context("display.max_seq_items", 10, "display.width", 80):
   1315                     raise KeyError(
-> 1316                         "Passing list-likes to .loc or [] with any missing labels "
   1317                         "is no longer supported. "
   1318                         f"The following labels were missing: {not_found}. "

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Int64Index([35549], dtype='int64'). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"
In [17]:
surveys_df[0:1]
Out[17]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
In [18]:
surveys_df[:4]
Out[18]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
In [19]:
surveys_df[:-1]
Out[19]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
... ... ... ... ... ... ... ... ... ...
35543 35544 12 31 2002 15 US NaN NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0

35548 rows × 9 columns

In [20]:
surveys_df.iloc[0:4, 1:4]
Out[20]:
month day year
0 7 16 1977
1 7 16 1977
2 7 16 1977
3 7 16 1977
In [21]:
surveys_df.loc[0:4, 1:4]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-21-aed7dd61eef6> in <module>
----> 1 surveys_df.loc[0:4, 1:4]

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    871                     # AttributeError for IntervalTree get_value
    872                     pass
--> 873             return self._getitem_tuple(key)
    874         else:
    875             # we by definition only have the 0th axis

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   1053             return self._multi_take(tup)
   1054 
-> 1055         return self._getitem_tuple_same_dim(tup)
   1056 
   1057     def _get_label(self, label, axis: int):

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_tuple_same_dim(self, tup)
    748                 continue
    749 
--> 750             retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
    751             # We should never have retval.ndim < self.ndim, as that should
    752             #  be handled by the _getitem_lowerdim call above.

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1086         if isinstance(key, slice):
   1087             self._validate_key(key, axis)
-> 1088             return self._get_slice_axis(key, axis=axis)
   1089         elif com.is_bool_indexer(key):
   1090             return self._getbool_axis(key, axis=axis)

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexing.py in _get_slice_axis(self, slice_obj, axis)
   1121         labels = obj._get_axis(axis)
   1122         indexer = labels.slice_indexer(
-> 1123             slice_obj.start, slice_obj.stop, slice_obj.step, kind="loc"
   1124         )
   1125 

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in slice_indexer(self, start, end, step, kind)
   4960         slice(1, 3, None)
   4961         """
-> 4962         start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
   4963 
   4964         # return a slice

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in slice_locs(self, start, end, step, kind)
   5161         start_slice = None
   5162         if start is not None:
-> 5163             start_slice = self.get_slice_bound(start, "left", kind)
   5164         if start_slice is None:
   5165             start_slice = 0

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
   5073         # For datetime indices label may be a string that has to be converted
   5074         # to datetime boundary according to its resolution.
-> 5075         label = self._maybe_cast_slice_bound(label, side, kind)
   5076 
   5077         # we need to look up the label

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in _maybe_cast_slice_bound(self, label, side, kind)
   5025         # this is rejected (generally .loc gets you here)
   5026         elif is_integer(label):
-> 5027             self._invalid_indexer("slice", label)
   5028 
   5029         return label

~/miniconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in _invalid_indexer(self, form, key)
   3262         """
   3263         raise TypeError(
-> 3264             f"cannot do {form} indexing on {type(self).__name__} with these "
   3265             f"indexers [{key}] of type {type(key).__name__}"
   3266         )

TypeError: cannot do slice indexing on Index with these indexers [1] of type int
  1. iloc uses integers to index position but loc uses labels.

Challenge 3 - Quaries

In [22]:
surveys_df.head()
Out[22]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [23]:
surveys_df[(surveys_df.year == 1999) & (surveys_df.weight <= 8)]
Out[23]:
record_id month day year plot_id species_id sex hindfoot_length weight
29082 29083 1 16 1999 21 RM M 16.0 8.0
29196 29197 2 20 1999 18 RM M 18.0 8.0
29421 29422 3 15 1999 16 RM M 15.0 8.0
29903 29904 10 10 1999 4 PP M 20.0 7.0
29905 29906 10 10 1999 4 PP M 21.0 4.0
In [24]:
surveys_df[surveys_df['species_id'].isin(['DM','NL'])]
test = ['DM','NL']
surveys_df[surveys_df['species_id'].isin(test)]
Out[24]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
... ... ... ... ... ... ... ... ... ...
35532 35533 12 31 2002 14 DM F 36.0 48.0
35533 35534 12 31 2002 14 DM M 37.0 56.0
35534 35535 12 31 2002 14 DM M 37.0 53.0
35535 35536 12 31 2002 14 DM F 35.0 42.0
35536 35537 12 31 2002 14 DM F 36.0 46.0

11848 rows × 9 columns

In [25]:
surveys_df[(surveys_df.weight > 0) | (surveys_df.weight == 0)]
Out[25]:
record_id month day year plot_id species_id sex hindfoot_length weight
62 63 8 19 1977 3 DM M 35.0 40.0
63 64 8 19 1977 7 DM M 37.0 48.0
64 65 8 19 1977 4 DM F 34.0 29.0
65 66 8 19 1977 4 DM F 35.0 46.0
66 67 8 19 1977 7 DM M 35.0 36.0
... ... ... ... ... ... ... ... ... ...
35540 35541 12 31 2002 15 PB F 24.0 31.0
35541 35542 12 31 2002 15 PB F 26.0 29.0
35542 35543 12 31 2002 15 PB F 27.0 34.0
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0

32283 rows × 9 columns

In [26]:
surveys_df[~(surveys_df.sex == 'M')]
Out[26]:
record_id month day year plot_id species_id sex hindfoot_length weight
2 3 7 16 1977 2 DM F 37.0 NaN
6 7 7 16 1977 2 PE F NaN NaN
8 9 7 16 1977 1 DM F 34.0 NaN
9 10 7 16 1977 6 PF F 20.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
... ... ... ... ... ... ... ... ... ...
35543 35544 12 31 2002 15 US NaN NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

18201 rows × 9 columns

In [27]:
surveys_df[~(surveys_df.sex == 'F')]
Out[27]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
5 6 7 16 1977 1 PF M 14.0 NaN
... ... ... ... ... ... ... ... ... ...
35543 35544 12 31 2002 15 US NaN NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

19859 rows × 9 columns

Challenge - Putting it all together

In [28]:
new_df = surveys_df[~(surveys_df.sex == 'F') & ~(surveys_df.sex == 'M')]
new_df.head()
Out[28]:
record_id month day year plot_id species_id sex hindfoot_length weight
13 14 7 16 1977 8 DM NaN NaN NaN
18 19 7 16 1977 4 PF NaN NaN NaN
33 34 7 17 1977 17 DM NaN NaN NaN
56 57 7 18 1977 22 DM NaN NaN NaN
76 77 8 19 1977 4 SS NaN NaN NaN
In [29]:
x = pd.isnull(new_df['sex'])
len(x)
Out[29]:
2511
In [30]:
new_df2 = surveys_df[(surveys_df.sex == 'F') | (surveys_df.sex == 'M') & (surveys_df.weight > 0)]
new_df2
Out[30]:
record_id month day year plot_id species_id sex hindfoot_length weight
2 3 7 16 1977 2 DM F 37.0 NaN
6 7 7 16 1977 2 PE F NaN NaN
8 9 7 16 1977 1 DM F 34.0 NaN
9 10 7 16 1977 6 PF F 20.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
... ... ... ... ... ... ... ... ... ...
35540 35541 12 31 2002 15 PB F 24.0 31.0
35541 35542 12 31 2002 15 PB F 26.0 29.0
35542 35543 12 31 2002 15 PB F 27.0 34.0
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0

32569 rows × 9 columns

In [31]:
by_site_sex = new_df2.groupby(['plot_id', 'sex'])
site_sex_count = by_site_sex['weight'].mean()
spc = site_sex_count.unstack()
s_plot = spc.plot(kind='bar', stacked=True, title="Total weight by site and sex")
s_plot.set_ylabel("Weight")
s_plot.set_xlabel("Plot")
Out[31]:
Text(0.5, 0, 'Plot')