A practical guide to window functions—ranking, aggregation, and offset operations computed over a sliding frame—demonstrated side-by-side in SQL and Python's pandas library.
This is a summary and pandas version of Leetcode post by farlowdw
Window functions make difficult problems easy. There’s a reason why window functions show up in most Medium/Hard problems. They’re immensely powerful when used with care. I provide a listing of different window functions and some of the problems in which I have found them to be helpful.
The followings are for experts:
Trying to write a SQL SELECT statement without understanding its logical processing order is like trying to evaluate a complicated mathematical expression with no parentheses. It’s unnecessarily difficult, and it can lead to uncertainty in what the result set for a query should be.
The logical processing order of the SELECT statement is generally as follows:
In SQL, this is different from how code is written. in pandas, however, we can write the code in this order.
.head(3).astype() for a single column.astype({dictionary}) for multiple columns.rename(columns = {dictionary, key = old name, value = new name}).shape, not .shape() & and |, not and and or.// and %.def not_boring_movies(cinema: pd.DataFrame) -> pd.DataFrame:
return cinema[
(cinema['id']%2 == 1) & (cinema['description'] != 'boring') # NOT and! & is bitwise.
].sort_values('rating', ascending = False)
.isin()def find_customers(customers: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
return customers[~customers['id'].isin(orders['customerId'])][['name']].rename(
columns = {'name': 'Customers'}
)
() def find_customers(customers: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
return customers[~customers['id'].isin(orders['customerId'])][['name']].rename(
columns = {'name': 'Customers'}
)
.isin()def find_customers(visits: pd.DataFrame, transactions: pd.DataFrame) -> pd.DataFrame:
return visits[
~visits['visit_id'].isin(transactions['visit_id'].drop_duplicates())
].groupby('customer_id').agg(
count_no_trans = ('visit_id', 'size')
).reset_index()
.fillna({column:value})def fill_missing_values(products: pd.DataFrame) -> pd.DataFrame:
return products.fillna({'quantity': 0})
.replace( {'m':'f', 'f':'m'}, inplace = True)def swap_salary(salary: pd.DataFrame) -> pd.DataFrame:
salary['sex'].replace( {'m':'f', 'f':'m'}, inplace = True)
return salary
.drop_duplicates(subset, keep, inplace)import pandas as pd
def find_primary_department(employee: pd.DataFrame) -> pd.DataFrame:
return employee.sort_values(
'primary_flag', ascending = False
).drop_duplicates(
subset = 'employee_id', keep = 'first'
)[['employee_id', 'department_id']]
.duplicated(columnname) outputs a boolean vectorimport pandas as pd
def duplicate_emails(person: pd.DataFrame) -> pd.DataFrame:
return person[person.duplicated('email')].drop_duplicates('email').rename(columns = {'email':'Email'})[['Email']]
.str.len()import pandas as pd
def invalid_tweets(tweets: pd.DataFrame) -> pd.DataFrame:
return tweets[tweets['content'].str.len()>15][['tweet_id']]
.str.capitalize()import pandas as pd
def fix_names(users: pd.DataFrame) -> pd.DataFrame:
users['name'] = users['name'].str.capitalize()
return users.sort_values('user_id')
.str.startswith('M')import pandas as pd
def calculate_special_bonus(employees: pd.DataFrame) -> pd.DataFrame:
employees['bonus'] = (
(~employees['name'].str.startswith('M') &
(employees['employee_id']%2==1)
)*employees['salary'])
return employees[
['employee_id', 'bonus']
].sort_values('employee_id')
.str.contains()import pandas as pd
def find_patients(patients: pd.DataFrame) -> pd.DataFrame:
return patients[
(patients.conditions.str.contains(' DIAB1')) |
(patients.conditions.str.startswith('DIAB1'))
]
import pandas as pd
def categorize_products(activities: pd.DataFrame) -> pd.DataFrame:
return activities.groupby('sell_date')['product'].agg(
[
('num_sold', 'nunique'),
('products', lambda x: ','.join(sorted(x.unique())))
# to Series.unique() outputs a python list
# sorted(list)
]
).reset_index()
DataFrame.groupby User GuideDataFrame.rolling Documentation