Phase 1 — Foundations
Basics
Control flow
Functions
Phase 2 — Data & APIs
APIs & scraping
Phase 3 — Analysis
Pandas
Visualisation
Phase 4 — ML
ML models
Time series
Python basics
Variables & data types
# Assignment
name = "Alice"          # str
count = 42             # int
price = 9.99           # float
is_active = True       # bool

# Type casting
int("5")              # → 5
float("3.14")         # → 3.14
str(42)               # → "42"
type(price)           # → <class 'float'>
Operators
Symbol Name Example
+ Addition 3 + 2 → 5
- Subtraction 5 - 2 → 3
* Multiplication 3 * 4 → 12
/ Division 7 / 2 → 3.5
% Modulus 7 % 2 → 1
** Exponent 2 ** 3 → 8
// Floor div 7 // 2 → 3
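Floor division and modulus pair naturally (quotient plus remainder); a quick sketch:

```python
# Split 135 minutes into hours + minutes
minutes = 135
hours = minutes // 60     # floor division → 2
leftover = minutes % 60   # modulus → 15

# divmod() returns both at once
quotient_remainder = divmod(minutes, 60)  # (2, 15)
```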
Lists
fruits = ["apple", "banana", "cherry"]

fruits[0]            # "apple"  (index)
fruits[-1]           # "cherry" (last)
fruits[1:3]          # ["banana","cherry"]

fruits.append("date")  # add to end
fruits.remove("apple") # remove item
len(fruits)           # count items
fruits.sort()          # sort in place
Dictionaries
person = {
  "name": "Alice",
  "age": 30
}

person["name"]        # "Alice"
person["city"] = "SG" # add key
person.keys()          # all keys
person.values()        # all values
person.items()         # key-value pairs
"age" in person       # True
Tuples & sets
# Tuple — immutable, ordered sequence
coords = (1.3, 103.8)
coords[0]             # 1.3

# Set — unique values only
tags = {"python", "data", "python"}
# → {"python", "data"}
tags.add("ml")
"data" in tags        # True
String methods
s = "  Hello, World!  "

s.strip()              # remove spaces
s.lower()              # "hello, world!"
s.upper()              # "HELLO, WORLD!"
s.replace("Hello","Hi")# "  Hi, World!  "
s.split(",")           # ["  Hello", " World!  "]
len(s)                # character count
f"Hi, {name}!"        # f-string
Control flow & loops
If / elif / else
if score >= 90:
    grade = "A"
elif score >= 75:
    grade = "B"
else:
    grade = "C"
Comparison & logical operators
Operator Meaning
== Equal to
!= Not equal
> / < Greater / less
>= / <= Greater/less or equal
and Both conditions true
or Either condition true
not Invert a condition
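These operators combine into plain boolean expressions; a minimal sketch with made-up variables:

```python
age = 20
has_id = True

can_enter = age >= 18 and has_id   # both must be true
is_minor = not (age >= 18)         # invert a condition
discount = age < 12 or age >= 65   # either side is enough
```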
For loops
# Loop over a list
for fruit in fruits:
    print(fruit)

# range(start, stop, step)
for i in range(1, 10, 2):
    print(i)  # 1,3,5,7,9

# Loop with index
for i in range(len(nums)):
    nums[i] = nums[i] * 2

# Loop over dict
for key, val in d.items():
    print(key, val)
While loops & list comprehension
# While loop
n = 0
while n < 5:
    print(n)
    n += 1

# List comprehension
nums = [1,2,3,4,5]
doubled = [x * 2 for x in nums]
# → [2, 4, 6, 8, 10]

# With condition
big = [x for x in nums if x > 3]
# → [4, 5]
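The condition and the transform can also be combined in a single comprehension:

```python
nums = [1, 2, 3, 4, 5]

# Keep values above 3, then double them
big_doubled = [x * 2 for x in nums if x > 3]
# → [8, 10]
```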
Functions
Defining & calling
def greet(name):
    return f"Hello, {name}!"

greet("Alice")   # → "Hello, Alice!"

# Default parameter
def greet(name="World"):
    return f"Hello, {name}!"

greet()          # → "Hello, World!"
Parameters & return values
def total_price(price, tax=0.09):
    return price * (1 + tax)

result = total_price(100, 0.1)
# result ≈ 110.0 (floating point)

# Multiple return values
def min_max(lst):
    return min(lst), max(lst)

lo, hi = min_max([1,5,3])
Lambda & built-ins
# Lambda (anonymous function)
square = lambda x: x ** 2
square(4)           # → 16

# Useful built-ins
len([1,2,3])         # 3
sum([1,2,3])         # 6
max([1,5,3])         # 5
min([1,5,3])         # 1
round(3.14159, 2)   # 3.14
sorted([3,1,2])      # [1,2,3]
Scope & docstrings
def calc_roi(gain, cost):
    """
    Calculate return on investment.
    gain: profit amount
    cost: initial investment
    """
    return (gain - cost) / cost * 100

# global variable
TAX_RATE = 0.09     # convention: CAPS

# local variable (only inside fn)
def fn():
    local_var = 5    # inaccessible outside
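A short sketch of the global/local distinction, using hypothetical names:

```python
TAX_RATE = 0.09                 # global — readable inside functions

def price_with_tax(price):
    subtotal = price            # local — exists only in this function
    return subtotal * (1 + TAX_RATE)

round(price_with_tax(100), 2)   # → 109.0
# print(subtotal)               # NameError — subtotal is gone out here
```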
APIs & web scraping
Basic API call
import requests

response = requests.get("https://api.example.com/data")

# Check status
print(response.status_code)  # 200 = OK

# Parse JSON response
data = response.json()
print(data)             # now a dict
API with parameters & key
# URL with params
url = "https://api.example.com/search"
params = {
    "term": "python",
    "api_key": "YOUR_KEY_HERE"
}

response = requests.get(url, params=params)
data = response.json()

# Access nested data
results = data["results"]
first = results[0]["name"]
HTTP status codes
Code Meaning
200 OK — success
400 Bad request
401 Unauthorized
404 Not found
500 Server error
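The standard library can map these codes to their official phrases — a stdlib-only sketch, no network needed:

```python
from http import HTTPStatus

HTTPStatus(200).phrase   # "OK"
HTTPStatus(404).phrase   # "Not Found"

# Readable error messages in scripts
code = 401
message = f"{code} {HTTPStatus(code).phrase}"   # "401 Unauthorized"
```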
Beautiful Soup — scraping
from bs4 import BeautifulSoup
import requests

# Step 1: GET the page HTML
response = requests.get("https://example.com")

# Step 2: Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: Select elements
soup.select('a')           # all links
soup.select('.class-name') # by CSS class
soup.select('#id')         # by id
soup.find('h1').text       # first h1 text
Scraping in a loop
all_titles = []
items = soup.select('.product-title')

for item in items:
    title = item.text.strip()
    all_titles.append(title)

# Get an attribute value
links = soup.select('a')
for link in links:
    href = link.get('href')
    print(href)
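The same select-and-loop pattern works on any HTML string; a self-contained sketch (the .product-title class is the example name from above, the snippet itself is invented):

```python
from bs4 import BeautifulSoup

html = """
<div>
  <h2 class="product-title"> Laptop </h2>
  <h2 class="product-title">Mouse</h2>
  <a href="/deals">Deals</a>
</div>
"""
soup = BeautifulSoup(html, "html.parser")

titles = [t.text.strip() for t in soup.select(".product-title")]
# → ["Laptop", "Mouse"]

hrefs = [a.get("href") for a in soup.select("a")]
# → ["/deals"]
```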
JSON to DataFrame
import pandas as pd

# From API list-of-dicts
df = pd.DataFrame(data["results"])

# From scraped list
df = pd.DataFrame({
    "title": all_titles,
    "url": all_urls
})
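A self-contained version of the list-of-dicts pattern, with inline sample data standing in for an API response:

```python
import pandas as pd

results = [
    {"name": "Alice", "score": 90},
    {"name": "Bob",   "score": 75},
]
df = pd.DataFrame(results)

df.shape           # (2, 2)
list(df.columns)   # ["name", "score"]
```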
Pandas — data analysis
Loading & inspecting
import pandas as pd

df = pd.read_csv("data.csv")

df.head(5)     # first 5 rows
df.tail(3)     # last 3 rows
df.shape       # (rows, cols)
df.info()      # dtypes + nulls
df.describe()  # summary stats
df.columns     # column names
df.dtypes      # data types
Selecting data
# Single column → Series
df["price"]

# Multiple columns → DataFrame
df[["price", "quantity"]]

# Filter rows
df[df["price"] > 50]

# Multiple conditions
df[(df["price"] > 50) & (df["qty"] > 10)]
df[(df["city"] == "SG") | (df["city"] == "KL")]

# Row by index
df.iloc[0]          # first row
df.iloc[0]["name"]  # specific value
Sorting & chaining
# Sort
df.sort_values(by="price", ascending=True)
df.sort_values(by="price", ascending=False)

# Chain filter → column
df[df["color"] == "red"]["price"]

# Chain filter → describe
df[df["color"] == "red"]["price"].describe()

# New column
df["total"] = df["price"] * df["qty"]
Missing data
# Identify nulls
df.isnull().sum()     # count per column

# Drop rows with any null
df.dropna()

# Drop nulls in specific column
df.dropna(subset=["price"])

# Fill nulls (returns a copy — assign back)
df["price"] = df["price"].fillna(0)
df["price"] = df["price"].fillna(df["price"].mean())

# Drop duplicates
df.drop_duplicates()
df.drop_duplicates(subset=["id"])
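A tiny end-to-end example of the fill-with-mean pattern on toy data:

```python
import pandas as pd
import numpy as np

df = pd.DataFrame({"price": [10.0, np.nan, 30.0]})

# mean() skips NaN, so the fill value is (10 + 30) / 2 = 20
df["price"] = df["price"].fillna(df["price"].mean())
df["price"].tolist()   # [10.0, 20.0, 30.0]
```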
groupby & aggregation
# Basic groupby
df.groupby(["category"]).count()
df.groupby(["category"])["sales"].sum()

# Multiple aggregations
df.groupby("category")["sales"].agg([
    "count", "mean", "min", "max"
])

# Group by multiple columns
df.groupby(["region", "category"]).mean()
Combining DataFrames
# Concatenate (stack rows)
pd.concat([df1, df2])

# Concatenate (add columns)
pd.concat([df1, df2], axis=1)

# Merge (like SQL JOIN)
pd.merge(
    left_df, right_df,
    how="left",       # inner/left/right/outer
    left_on="id",
    right_on="user_id"
)
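A toy left join using the column names above (data invented for illustration):

```python
import pandas as pd

left_df = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
right_df = pd.DataFrame({"user_id": [1, 3], "city": ["SG", "KL"]})

merged = pd.merge(left_df, right_df, how="left",
                  left_on="id", right_on="user_id")
# Both left rows are kept; Bob has no match, so his city is NaN
```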
Data visualisation
Pandas plots (quick)
import matplotlib.pyplot as plt

# Line chart
df["price"].plot(kind="line", title="Price")

# Bar chart
df.groupby("category")["sales"].sum().plot(kind="bar")

# Histogram
df["age"].plot(kind="hist", bins=20)

# Scatter
df.plot(kind="scatter", x="area", y="price")

plt.show()
Plotly Express (interactive)
import plotly.express as px

# Bar chart
px.bar(df, x="category", y="sales",
       title="Sales by Category")

# Scatter plot
px.scatter(df, x="area", y="price",
           color="region")

# Line chart
px.line(df, x="date", y="value")

# Histogram
px.histogram(df, x="price", nbins=30)
Correlation & heatmap
# Correlation matrix (numeric columns only)
df.corr(numeric_only=True)

# Plotly heatmap
px.imshow(df.corr(),
           title="Correlation Heatmap",
           color_continuous_scale="RdBu")

# Quick value counts
df["category"].value_counts().plot(kind="bar")
Chart types guide
Goal Chart type
Compare categories Bar chart
Show relationship Scatter plot
Show distribution Histogram / box
Change over time Line chart
Part of a whole Pie / stacked bar
Correlation strength Heatmap
Machine learning models
Linear regression formula
y = a₀ + a₁x₁ + a₂x₂ + … + aₙxₙ
y = predicted value  ·  a₀ = intercept  ·  a₁…aₙ = coefficients  ·  x₁…xₙ = features
Linear regression in sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

X = df[["area", "bedrooms"]]  # features
y = df["price"]                # target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse  = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2   = model.score(X_test, y_test)
KNN classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix
)

X = df[["humidity", "temp"]]
y = df["will_rain"]            # categorical

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

acc = accuracy_score(y_test, y_pred)
cm  = confusion_matrix(y_test, y_pred)
Model evaluation metrics
Metric Use for Good = ?
RMSE Regression Lower = better
R² Regression Closer to 1
Accuracy Classification Higher = better
Precision Classification Higher = better
Recall Classification Higher = better
Confusion matrix: TP = true positive, TN = true negative, FP = false positive (type I), FN = false negative (type II). Context matters — for medical tests, FN is usually more costly.
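The four cells give the headline metrics directly; a hand-rolled sketch with hypothetical counts:

```python
tp, tn, fp, fn = 40, 30, 10, 20   # hypothetical counts

accuracy  = (tp + tn) / (tp + tn + fp + fn)   # 0.7
precision = tp / (tp + fp)                    # 0.8
recall    = tp / (tp + fn)                    # ≈ 0.667
```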
Sentiment analysis (NLP)
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Bag-of-words vectorisation
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["text"])

# Word cloud
wc = WordCloud(background_color="white")
wc.generate(" ".join(df["text"]))
wc.to_image()
Linear regression assumptions (LINE)
Letter Assumption
L Linearity — x and y have a linear relationship
I Independence — observations don't affect each other
N Normality — residuals are normally distributed
E Equal variances (homoscedasticity)
KNN — choosing k
k value Effect
Small k (e.g. 1) Very flexible, prone to overfitting
Large k (e.g. 50) Smoother, may underfit
Rule of thumb Start with √n; tune with cross-validation
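The √n rule of thumb in code; forcing k odd (a common extra convention, not from this sheet) avoids tied votes in binary classification:

```python
n_train = 200                # hypothetical training-set size
k = round(n_train ** 0.5)    # √200 ≈ 14.1 → 14
if k % 2 == 0:
    k += 1                   # prefer odd k
k                            # → 15
```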
Time series data
Loading & parsing dates
import pandas as pd

df = pd.read_csv(
    "data.csv",
    parse_dates=["date"],   # auto-parse
    index_col="date"        # set as index
)

# Or convert after loading
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
Resampling
# Resample to monthly mean
df.resample("M").mean()

# Common frequency aliases
# "D" = daily   "W" = weekly
# "M" = monthly "Q" = quarterly
# "Y" = yearly
# (pandas ≥ 2.2 prefers "ME", "QE", "YE")

df.resample("W").sum()
df.resample("Q").max()
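A self-contained resample on a synthetic daily series (dates invented; 2024-01-01 is a Monday, so 14 days span two full Mon–Sun weeks):

```python
import pandas as pd

idx = pd.date_range("2024-01-01", periods=14, freq="D")
df = pd.DataFrame({"sales": range(14)}, index=idx)

weekly = df.resample("W").sum()
weekly["sales"].tolist()   # [21, 70] — two weekly buckets
```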
Rolling windows
# 7-day rolling mean
df["price"].rolling(window=7).mean()

# 30-day rolling std deviation
df["price"].rolling(30).std()

# Add as new column
df["7d_avg"] = df["price"].rolling(7).mean()

# Expanding (cumulative)
df["price"].expanding().mean()
Shifting & diff
# Lag — previous period value
df["prev_price"] = df["price"].shift(1)

# % change from previous period
df["pct_change"] = df["price"].pct_change()

# Absolute change
df["diff"] = df["price"].diff()

# Lead — next period
df["next_price"] = df["price"].shift(-1)
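A tiny worked example of diff and pct_change (the first row has no previous value, so it is NaN):

```python
import pandas as pd

s = pd.Series([100.0, 110.0, 99.0])

s.diff().tolist()         # [nan, 10.0, -11.0]
s.pct_change().tolist()   # [nan, 0.1, -0.1] (approximately)
```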
Train/test split (time-ordered)
# CRITICAL: do NOT randomise!
# Keep data in chronological order

split = int(len(df) * 0.8)

train = df.iloc[:split]    # first 80%
test  = df.iloc[split:]    # last 20%

# e.g. 3 years train → 1 year test
Visualising time series
import plotly.express as px

# Line chart (auto-handles datetime index)
px.line(df, y="price", title="Price over time")

# With rolling average overlay
df["7d_avg"] = df["price"].rolling(7).mean()
px.line(df, y=["price", "7d_avg"])