import pandas
import seaborn as sns
tips = sns.load_dataset("tips")
tips
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
tips.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_bill 244 non-null float64 1 tip 244 non-null float64 2 sex 244 non-null category 3 smoker 244 non-null category 4 day 244 non-null category 5 time 244 non-null category 6 size 244 non-null int64 dtypes: category(4), float64(2), int64(1) memory usage: 7.4 KB
tips["sex"].unique()
['Female', 'Male'] Categories (2, object): ['Male', 'Female']
tips["day"].unique()
['Sun', 'Sat', 'Thur', 'Fri'] Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']
tips["time"].unique()
['Dinner', 'Lunch'] Categories (2, object): ['Lunch', 'Dinner']
tips["tip"].mean()
2.9982786885245902
tips["tip"].max()
10.0
tips[tips["smoker"]== "Yes"]
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
56 | 38.01 | 3.00 | Male | Yes | Sat | Dinner | 4 |
58 | 11.24 | 1.76 | Male | Yes | Sat | Dinner | 2 |
60 | 20.29 | 3.21 | Male | Yes | Sat | Dinner | 2 |
61 | 13.81 | 2.00 | Male | Yes | Sat | Dinner | 2 |
62 | 11.02 | 1.98 | Male | Yes | Sat | Dinner | 2 |
... | ... | ... | ... | ... | ... | ... | ... |
234 | 15.53 | 3.00 | Male | Yes | Sat | Dinner | 2 |
236 | 12.60 | 1.00 | Male | Yes | Sat | Dinner | 2 |
237 | 32.83 | 1.17 | Male | Yes | Sat | Dinner | 2 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
93 rows × 7 columns
tips.sort_values(by="total_bill", ascending=False)
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
170 | 50.81 | 10.00 | Male | Yes | Sat | Dinner | 3 |
212 | 48.33 | 9.00 | Male | No | Sat | Dinner | 4 |
59 | 48.27 | 6.73 | Male | No | Sat | Dinner | 4 |
156 | 48.17 | 5.00 | Male | No | Sun | Dinner | 6 |
182 | 45.35 | 3.50 | Male | Yes | Sun | Dinner | 3 |
... | ... | ... | ... | ... | ... | ... | ... |
149 | 7.51 | 2.00 | Male | No | Thur | Lunch | 2 |
111 | 7.25 | 1.00 | Female | No | Sat | Dinner | 1 |
172 | 7.25 | 5.15 | Male | Yes | Sun | Dinner | 2 |
92 | 5.75 | 1.00 | Female | Yes | Fri | Dinner | 2 |
67 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
244 rows × 7 columns
grouped = tips.groupby("sex")
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff30a2a3ee0>
grouped.get_group("Male")
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
5 | 25.29 | 4.71 | Male | No | Sun | Dinner | 4 |
6 | 8.77 | 2.00 | Male | No | Sun | Dinner | 2 |
... | ... | ... | ... | ... | ... | ... | ... |
236 | 12.60 | 1.00 | Male | Yes | Sat | Dinner | 2 |
237 | 32.83 | 1.17 | Male | Yes | Sat | Dinner | 2 |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
157 rows × 7 columns
for k, df in grouped:
print(k)
display(df.head(3))
print("\n")
Male
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
Female
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
11 | 35.26 | 5.00 | Female | No | Sun | Dinner | 4 |
grouped.count()
total_bill | tip | smoker | day | time | size | |
---|---|---|---|---|---|---|
sex | ||||||
Male | 157 | 157 | 157 | 157 | 157 | 157 |
Female | 87 | 87 | 87 | 87 | 87 | 87 |
grouped.sum()
total_bill | tip | size | |
---|---|---|---|
sex | |||
Male | 3256.82 | 485.07 | 413 |
Female | 1570.95 | 246.51 | 214 |
grouped["total_bill"].max()
sex Male 50.81 Female 44.30 Name: total_bill, dtype: float64
grouped[["total_bill", "tip"]].mean()
total_bill | tip | |
---|---|---|
sex | ||
Male | 20.744076 | 3.089618 |
Female | 18.056897 | 2.833448 |
def large_amounts(values, min_value=5):
n = 0
for t in values:
if t >= min_value:
n += 1
return n
grouped[["tip", "total_bill"]].agg(large_amounts)
tip | total_bill | |
---|---|---|
sex | ||
Male | 20 | 157 |
Female | 8 | 86 |
grouped[["tip", "total_bill"]].agg({"tip": lambda x: large_amounts(x, min_value=3),
"total_bill": lambda x: large_amounts(x, min_value=15)
})
tip | total_bill | |
---|---|---|
sex | ||
Male | 82 | 113 |
Female | 39 | 51 |
def large_tips(df):
df = df.sort_values(by="tip", ascending=False)
return df.head(3)
big_tips = grouped.apply(large_tips)
big_tips
total_bill | tip | sex | smoker | day | time | size | ||
---|---|---|---|---|---|---|---|---|
sex | ||||||||
Male | 170 | 50.81 | 10.00 | Male | Yes | Sat | Dinner | 3 |
212 | 48.33 | 9.00 | Male | No | Sat | Dinner | 4 | |
23 | 39.42 | 7.58 | Male | No | Sat | Dinner | 4 | |
Female | 214 | 28.17 | 6.50 | Female | Yes | Sat | Dinner | 3 |
52 | 34.81 | 5.20 | Female | No | Sun | Dinner | 4 | |
85 | 34.83 | 5.17 | Female | No | Thur | Lunch | 4 |
tips
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
means = tips.groupby(by=["day", "time"]).mean()
means
total_bill | tip | size | ||
---|---|---|---|---|
day | time | |||
Thur | Lunch | 17.664754 | 2.767705 | 2.459016 |
Dinner | 18.780000 | 3.000000 | 2.000000 | |
Fri | Lunch | 12.845714 | 2.382857 | 2.000000 |
Dinner | 19.663333 | 2.940000 | 2.166667 | |
Sat | Lunch | NaN | NaN | NaN |
Dinner | 20.441379 | 2.993103 | 2.517241 | |
Sun | Lunch | NaN | NaN | NaN |
Dinner | 21.410000 | 3.255132 | 2.842105 |
tips[(tips["day"]=="Sat") & (tips["time"] == "Lunch") ]
total_bill | tip | sex | smoker | day | time | size |
---|
means.index
MultiIndex([('Thur', 'Lunch'), ('Thur', 'Dinner'), ( 'Fri', 'Lunch'), ( 'Fri', 'Dinner'), ( 'Sat', 'Lunch'), ( 'Sat', 'Dinner'), ( 'Sun', 'Lunch'), ( 'Sun', 'Dinner')], names=['day', 'time'])
means.loc[('Thur', 'Lunch')]
total_bill 17.664754 tip 2.767705 size 2.459016 Name: (Thur, Lunch), dtype: float64
flights = sns.load_dataset("flights")
flights
year | month | passengers | |
---|---|---|---|
0 | 1949 | Jan | 112 |
1 | 1949 | Feb | 118 |
2 | 1949 | Mar | 132 |
3 | 1949 | Apr | 129 |
4 | 1949 | May | 121 |
... | ... | ... | ... |
139 | 1960 | Aug | 606 |
140 | 1960 | Sep | 508 |
141 | 1960 | Oct | 461 |
142 | 1960 | Nov | 390 |
143 | 1960 | Dec | 432 |
144 rows × 3 columns
flights = flights.set_index("year")
flights
month | passengers | |
---|---|---|
year | ||
1949 | Jan | 112 |
1949 | Feb | 118 |
1949 | Mar | 132 |
1949 | Apr | 129 |
1949 | May | 121 |
... | ... | ... |
1960 | Aug | 606 |
1960 | Sep | 508 |
1960 | Oct | 461 |
1960 | Nov | 390 |
1960 | Dec | 432 |
144 rows × 2 columns
flights.loc["1950"]
month | passengers | |
---|---|---|
year | ||
1950 | Jan | 115 |
1950 | Feb | 126 |
1950 | Mar | 141 |
1950 | Apr | 135 |
1950 | May | 125 |
1950 | Jun | 149 |
1950 | Jul | 170 |
1950 | Aug | 170 |
1950 | Sep | 158 |
1950 | Oct | 133 |
1950 | Nov | 114 |
1950 | Dec | 140 |
flights = flights.reset_index()
flights
year | month | passengers | |
---|---|---|---|
0 | 1949 | Jan | 112 |
1 | 1949 | Feb | 118 |
2 | 1949 | Mar | 132 |
3 | 1949 | Apr | 129 |
4 | 1949 | May | 121 |
... | ... | ... | ... |
139 | 1960 | Aug | 606 |
140 | 1960 | Sep | 508 |
141 | 1960 | Oct | 461 |
142 | 1960 | Nov | 390 |
143 | 1960 | Dec | 432 |
144 rows × 3 columns
flights = flights.set_index(["year", "month"])
flights
passengers | ||
---|---|---|
year | month | |
1949 | Jan | 112 |
Feb | 118 | |
Mar | 132 | |
Apr | 129 | |
May | 121 | |
... | ... | ... |
1960 | Aug | 606 |
Sep | 508 | |
Oct | 461 | |
Nov | 390 | |
Dec | 432 |
144 rows × 1 columns
flights.index
MultiIndex([(1949, 'Jan'), (1949, 'Feb'), (1949, 'Mar'), (1949, 'Apr'), (1949, 'May'), (1949, 'Jun'), (1949, 'Jul'), (1949, 'Aug'), (1949, 'Sep'), (1949, 'Oct'), ... (1960, 'Mar'), (1960, 'Apr'), (1960, 'May'), (1960, 'Jun'), (1960, 'Jul'), (1960, 'Aug'), (1960, 'Sep'), (1960, 'Oct'), (1960, 'Nov'), (1960, 'Dec')], names=['year', 'month'], length=144)
flights.loc[(1960, 'Mar')]
passengers 419 Name: (1960, Mar), dtype: int64
flights = flights.unstack()
flights
passengers | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
month | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
year | ||||||||||||
1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
flights["Jan"]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3360 try: -> 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: ~/opt/anaconda3/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() ~/opt/anaconda3/lib/python3.8/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'Jan' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) /var/folders/vd/9gpvwb493r52y4sgtl_fvtvm0000gn/T/ipykernel_12352/3691583124.py in <module> ----> 1 flights["Jan"] ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key) 3455 if is_single_key: 3456 if self.columns.nlevels > 1: -> 3457 return self._getitem_multilevel(key) 3458 indexer = self.columns.get_loc(key) 3459 if is_integer(indexer): ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in _getitem_multilevel(self, key) 3506 def _getitem_multilevel(self, key): 3507 # self.columns is a MultiIndex -> 3508 loc = self.columns.get_loc(key) 3509 if isinstance(loc, (slice, np.ndarray)): 3510 new_columns = self.columns[loc] ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/multi.py in get_loc(self, key, method) 2920 2921 if not isinstance(key, tuple): -> 2922 loc = self._get_level_indexer(key, level=0) 2923 return _maybe_to_slice(loc) 2924 ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/multi.py in _get_level_indexer(self, key, level, indexer) 3202 else: 3203 -> 3204 idx = self._get_loc_single_level_index(level_index, key) 3205 3206 if level > 0 or self._lexsort_depth == 0: ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/multi.py in _get_loc_single_level_index(self, level_index, key) 2853 return -1 2854 else: -> 2855 return level_index.get_loc(key) 2856 2857 def get_loc(self, key, method=None): ~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: -> 3363 raise KeyError(key) from err 3364 3365 if is_scalar(key) and isna(key) and not self.hasnans: KeyError: 'Jan'
flights.columns
MultiIndex([('passengers', 'Jan'), ('passengers', 'Feb'), ('passengers', 'Mar'), ('passengers', 'Apr'), ('passengers', 'May'), ('passengers', 'Jun'), ('passengers', 'Jul'), ('passengers', 'Aug'), ('passengers', 'Sep'), ('passengers', 'Oct'), ('passengers', 'Nov'), ('passengers', 'Dec')], names=[None, 'month'])
flights[('passengers', 'Jan')]
year 1949 112 1950 115 1951 145 1952 171 1953 196 1954 204 1955 242 1956 284 1957 315 1958 340 1959 360 1960 417 Name: (passengers, Jan), dtype: int64
flights = flights.droplevel(0, axis=1)
flights
month | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
---|---|---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||||
1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
flights.columns
CategoricalIndex(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], categories=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', ...], ordered=False, dtype='category', name='month')
flights["Jan"]
year 1949 112 1950 115 1951 145 1952 171 1953 196 1954 204 1955 242 1956 284 1957 315 1958 340 1959 360 1960 417 Name: Jan, dtype: int64
flights
month | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
---|---|---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||||
1949 | 112 | 118 | 132 | 129 | 121 | 135 | 148 | 148 | 136 | 119 | 104 | 118 |
1950 | 115 | 126 | 141 | 135 | 125 | 149 | 170 | 170 | 158 | 133 | 114 | 140 |
1951 | 145 | 150 | 178 | 163 | 172 | 178 | 199 | 199 | 184 | 162 | 146 | 166 |
1952 | 171 | 180 | 193 | 181 | 183 | 218 | 230 | 242 | 209 | 191 | 172 | 194 |
1953 | 196 | 196 | 236 | 235 | 229 | 243 | 264 | 272 | 237 | 211 | 180 | 201 |
1954 | 204 | 188 | 235 | 227 | 234 | 264 | 302 | 293 | 259 | 229 | 203 | 229 |
1955 | 242 | 233 | 267 | 269 | 270 | 315 | 364 | 347 | 312 | 274 | 237 | 278 |
1956 | 284 | 277 | 317 | 313 | 318 | 374 | 413 | 405 | 355 | 306 | 271 | 306 |
1957 | 315 | 301 | 356 | 348 | 355 | 422 | 465 | 467 | 404 | 347 | 305 | 336 |
1958 | 340 | 318 | 362 | 348 | 363 | 435 | 491 | 505 | 404 | 359 | 310 | 337 |
1959 | 360 | 342 | 406 | 396 | 420 | 472 | 548 | 559 | 463 | 407 | 362 | 405 |
1960 | 417 | 391 | 419 | 461 | 472 | 535 | 622 | 606 | 508 | 461 | 390 | 432 |
flights = flights.stack()
flights
year month 1949 Jan 112 Feb 118 Mar 132 Apr 129 May 121 ... 1960 Aug 606 Sep 508 Oct 461 Nov 390 Dec 432 Length: 144, dtype: int64