import glob
import pandas as pd
import numpy as np
def make_date(x):
"""
Makes a datetime object out of the Date and Time columns
"""
return pd.to_datetime(x['Date'] + " " + x['Time'], format="%Y/%m/%d %I:%M %p")
def compute_precip(x):
"""
Returns for each entry the amount of precipitation that has accumulated
in the previous five minutes, inserting NA for any entry for which either:
- the difference in accumulated precipitation is negative, or
- the previous entry was not five minutes ago.
"""
dt = x["Date"].diff().dt.seconds
dp = np.maximum(0, x['Precip_Accum_mm'].diff()).mask(dt != 300, pd.NA)
return dp
def read_weather_files(ddir):
"""
Reads in all CSV files in the directory `ddir`, and returns a concatenated
data frame. For each file, assumes that file names are of the form
"something_CODE.csv"; and inserts "CODE into the "code" column of the result
for that file.
"""
wfiles = glob.glob(ddir + "/" + "*.csv")
assert len(wfiles) > 0, "No files found."
xl = []
for f in wfiles:
x = pd.read_csv(f).convert_dtypes()
x['Date'] = make_date(x)
x['code'] = f.split("/")[-1].split("_")[0] ## change "/" to "\\" on windows
x['Precip_Amount_mm'] = compute_precip(x)
xl.append(x)
return pd.concat(xl)Groupwise operations: split/apply/combine
Goals
You have used “split/apply/combine” operations, e.g., pd.DataFrame.aggregate. Our goal here is to think about best practices/workflows and get some practice with different applications, on a concrete dataset.
The data: Weather Underground
We looked at the “personal weather station” datat from Weather Underground in a previous class: Weather Data.
Here’s the data:
- a table of local stations: stations.csv
- a zip file of data: weather_data (unzip to
data/)
Parsing code
From your homework, here’s code to read in the files:
Let’s have a look!
import plotnine as p9
stations = pd.read_csv("data/stations.csv")
stations| name | code | lat | lon | elev | |
|---|---|---|---|---|---|
| 0 | BETHEL | KOREUGEN31 | 44.080 | 123.200 | 380 |
| 1 | Torrington Station | KOREUGEN67 | 44.120 | 123.150 | 377 |
| 2 | South Eugene | KOREUGEN74 | 44.030 | 123.110 | 440 |
| 3 | South University | KOREUGEN127 | 44.037 | 123.074 | 470 |
| 4 | Old Baldy | KOREUGEN225 | 44.000 | 123.070 | 800 |
| 5 | Lafferty Park | KOREUGEN226 | 44.020 | 123.100 | 640 |
| 6 | Crescent Ave | KOREUGEN249 | 44.090 | 123.080 | 410 |
| 7 | Alvadore | KOREUGEN260 | 44.100 | 123.270 | 400 |
| 8 | Lema | KOREUGEN295 | 44.080 | 123.130 | 410 |
| 9 | College Hill | KOREUGEN303 | 44.030 | 123.100 | 587 |
| 10 | Churchill neighborhood | KOREUGEN307 | 44.030 | 123.150 | 545 |
| 11 | Stormy | KOREUGEN315 | 44.110 | 123.150 | 387 |
| 12 | Vickers_Whiteaker | KOREUGEN319 | 44.060 | 123.110 | 423 |
| 13 | CrossFit | KOREUGEN321 | 44.051 | 123.085 | 454 |
| 14 | Calliope Corner | KOREUGEN343 | 44.090 | 123.140 | 394 |
| 15 | Stormy | KOREUGEN315 | 44.110 | 123.150 | 387 |
| 16 | Andúril | KOREUGEN333 | 44.080 | 123.200 | 374 |
| 17 | Friends of Buford Park Native Plant Nursery | KOREUGEN346 | 44.020 | 122.990 | 463 |
| 18 | Home | KOREUGEN357 | 44.090 | 123.170 | 384 |
| 19 | Kitselman Eugene | KOREUGEN372 | 44.020 | 123.140 | 984 |
| 20 | Spyglass Osprey PWS | KOREUGEN406 | 44.080 | 123.090 | 417 |
| 21 | Spencer Gardens | KOREUGEN411 | 44.010 | 123.080 | 581 |
| 22 | Powell Street | KOREUGEN432 | 44.020 | 123.110 | 856 |
| 23 | EugeneHillside | KOREUGEN453 | 44.018 | 123.068 | 817 |
| 24 | Ambient WS | KOREUGEN463 | 44.023 | 123.083 | 135 |
| 25 | DragonFire Station | KOREUGEN472 | 44.030 | 123.130 | 728 |
| 26 | Eugene Palace | KOREUGEN507 | 44.100 | 123.160 | 117 |
| 27 | Charla | KOREUGEN514 | 44.039 | 123.054 | 469 |
| 28 | Neo | KOREUGEN516 | 44.020 | 123.150 | 990 |
| 29 | Rooftop | KOREUGEN540 | 44.020 | 123.060 | 902 |
| 30 | River Heights 1 | KORSPRIN69 | 44.070 | 122.970 | 597 |
| 31 | Putman Weathet | KORSPRIN96 | 44.080 | 123.030 | 446 |
| 32 | PEF station | KORSPRIN138 | 44.070 | 123.000 | 472 |
| 33 | Base West Kelly Butte | KORSPRIN144 | 44.050 | 123.040 | 443 |
| 34 | Kacy's Station | KORSPRIN145 | 44.030 | 123.020 | 463 |
| 35 | PWS01 | KORSPRIN172 | 44.030 | 122.980 | 489 |
| 36 | McKenzie View | KORSPRIN176 | 44.090 | 122.970 | 483 |
| 37 | Washburne | KORSPRIN194 | 44.050 | 123.020 | 460 |
| 38 | FRS541 | KORVENET18 | 44.080 | 123.310 | 381 |
weather = read_weather_files("data/weather_data")
weather| Date | Time | Temperature_C | Dew_Point_C | Humidity_% | Wind | Speed_kmh | Gust_kmh | Pressure_hPa | Precip_Rate_mm | Precip_Accum_mm | UV | Solar_w/m2 | code | Precip_Amount_mm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-12-28 17:39:00 | 05:39 PM | 5.61 | 5.28 | 98 | SSW | 0.0 | 0.0 | 1013.88 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN74 | <NA> |
| 1 | 2015-12-28 17:42:00 | 05:42 PM | 5.61 | 5.5 | 99 | WSW | 0.0 | 0.0 | 1013.88 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN74 | <NA> |
| 2 | 2015-12-28 17:47:00 | 05:47 PM | 5.72 | 5.61 | 99 | West | 3.22 | 11.91 | 1013.88 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN74 | 0.0 |
| 3 | 2015-12-28 17:52:00 | 05:52 PM | 5.72 | 5.61 | 99 | West | 0.0 | 0.0 | 1013.88 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN74 | 0.0 |
| 4 | 2015-12-28 17:57:00 | 05:57 PM | 5.72 | 5.61 | 99 | SSE | 0.0 | 0.0 | 1013.55 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN74 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 98536 | 2025-12-31 23:39:00 | 11:39 PM | 2.0 | 1.78 | 99 | NNW | 0.0 | 0.0 | 1013.55 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN127 | 0.0 |
| 98537 | 2025-12-31 23:44:00 | 11:44 PM | 2.0 | 1.78 | 99 | NNW | 0.32 | 0.48 | 1013.21 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN127 | 0.0 |
| 98538 | 2025-12-31 23:49:00 | 11:49 PM | 2.0 | 1.78 | 99 | NNW | 0.0 | 0.0 | 1013.21 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN127 | 0.0 |
| 98539 | 2025-12-31 23:54:00 | 11:54 PM | 2.11 | 1.89 | 99 | NNW | 0.0 | 0.0 | 1013.21 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN127 | 0.0 |
| 98540 | 2025-12-31 23:59:00 | 11:59 PM | 2.22 | 2.0 | 99 | WNW | 0.16 | 0.32 | 1013.21 | 0.0 | 0.0 | 0 | 0.0 | KOREUGEN127 | 0.0 |
1357156 rows × 15 columns
Beware
pd.crosstab( weather['code'], weather['Date'].dt.year)| Date | 2015 | 2024 | 2025 |
|---|---|---|---|
| code | |||
| KOREUGEN127 | 0 | 0 | 98541 |
| KOREUGEN249 | 0 | 0 | 97346 |
| KOREUGEN267 | 0 | 0 | 77024 |
| KOREUGEN295 | 0 | 0 | 97577 |
| KOREUGEN303 | 0 | 0 | 97480 |
| KOREUGEN307 | 0 | 0 | 98181 |
| KOREUGEN31 | 102093 | 102832 | 0 |
| KOREUGEN319 | 0 | 0 | 75207 |
| KOREUGEN321 | 0 | 0 | 92791 |
| KOREUGEN406 | 0 | 0 | 97380 |
| KOREUGEN472 | 0 | 0 | 92487 |
| KOREUGEN539 | 0 | 0 | 82759 |
| KOREUGEN67 | 22514 | 103477 | 0 |
| KOREUGEN74 | 622 | 0 | 18845 |
Goals:
What we’d like to do is understand how well measurements in one part of Eugene/Springfield predicts measurements in another part. In particular, how well does rainfall at one point – the National Weather Service station – predict load for the municipal stormwater system?
So: let’s understand the data, with this goal in mind.
Summarizing
Sometimes we want to compute one (or, a few) summary stats per group.
Mean temperature, by day of the year
(
weather.assign(day = lambda df: df['Date'].dt.dayofyear)
.groupby("day")
.aggregate(
Temperature_C = ("Temperature_C", "mean"),
).reset_index()
)| day | Temperature_C | |
|---|---|---|
| 0 | 1 | 5.384088 |
| 1 | 2 | 7.075204 |
| 2 | 3 | 7.672223 |
| 3 | 4 | 7.919059 |
| 4 | 5 | 8.865104 |
| ... | ... | ... |
| 361 | 362 | 3.729265 |
| 362 | 363 | 3.596271 |
| 363 | 364 | 2.461338 |
| 364 | 365 | 1.456371 |
| 365 | 366 | 3.199965 |
366 rows × 2 columns
# plot itMean total daily precip, by day of the year
(“Mean” across what?)
(
weather.assign(day = lambda df: df['Date'].dt.dayofyear)
.groupby(["day", "code"])
.aggregate(
Precip_Amount_mm = ("Precip_Amount_mm", "sum"),
).reset_index()
.groupby("day")
.aggregate(
Precip_Amount_mm = ("Precip_Amount_mm", "mean"),
).reset_index()
)| day | Precip_Amount_mm | |
|---|---|---|
| 0 | 1 | 4.7825 |
| 1 | 2 | 20.169167 |
| 2 | 3 | 8.295833 |
| 3 | 4 | 8.000833 |
| 4 | 5 | 10.140909 |
| ... | ... | ... |
| 361 | 362 | 4.973333 |
| 362 | 363 | 5.334167 |
| 363 | 364 | 3.848462 |
| 364 | 365 | 0.331538 |
| 365 | 366 | 2.54 |
366 rows × 2 columns
# plot itBrainstorming
What are some other summaries?
Transforming
Other times, we want to transform each value to a new value, but in a way that depends on the group.
Station temperature relative to daily average
weather['seasonal_temp'] = (
weather
.assign(day = lambda df: df['Date'].dt.dayofyear)
.loc[:,['day', 'Temperature_C']]
.groupby("day")
.transform('mean')
)Station temperature relative to regional average
hourly_weather = (
weather.loc[:,['code', 'Date', 'Temperature_C', 'Precip_Amount_mm']]
.assign(Date = lambda df: df['Date'].dt.round('h'))
.groupby(["code", 'Date'])
.mean()
.reset_index()
)
hourly_weather['temp_relative'] = (
hourly_weather.loc[:,['Date','Temperature_C']]
.groupby("Date")
.transform(
lambda x: x - x.mean()
)
)Brainstorming
What’s another transformation?
Multi-level splits
When things get complicated, two important skills are:
- write out what you want to do, carefully, and
- double-check that you’ve done the right thing.
Question: What’s the difference between these? What does each tell you, in real-world terms?
mean of (standard deviation of daily rainfall over January) across locations
standard deviation of (mean of daily rainfall over January) across locations