Python basics
Variables & data types
# Assignment
name = "Alice"       # str
count = 42           # int
price = 9.99         # float
is_active = True     # bool

# Type casting
int("5")             # → 5
float("3.14")        # → 3.14
str(42)              # → "42"
type(price)          # → <class 'float'>
Operators
| Symbol | Name | Example |
|---|---|---|
| + | Addition | 3 + 2 → 5 |
| - | Subtraction | 5 - 2 → 3 |
| * | Multiplication | 3 * 4 → 12 |
| / | Division | 7 / 2 → 3.5 |
| % | Modulus | 7 % 2 → 1 |
| ** | Exponent | 2 ** 3 → 8 |
| // | Floor div | 7 // 2 → 3 |
Lists
fruits = ["apple", "banana", "cherry"] fruits[0] # "apple" (index) fruits[-1] # "cherry" (last) fruits[1:3] # ["banana","cherry"] fruits.append("date") # add to end fruits.remove("apple") # remove item len(fruits) # count items fruits.sort() # sort in place
Dictionaries
person = {
"name": "Alice",
"age": 30
}
person["name"] # "Alice"
person["city"] = "SG" # add key
person.keys() # all keys
person.values() # all values
person.items() # key-value pairs
"age" in person # True
Tuples & sets
# Tuple — immutable list
coords = (1.3, 103.8)
coords[0]          # 1.3

# Set — unique values only
tags = {"python", "data", "python"}   # → {"python", "data"}
tags.add("ml")
"data" in tags     # True
String methods
s = " Hello, World! " s.strip() # remove spaces s.lower() # "hello, world!" s.upper() # "HELLO, WORLD!" s.replace("Hello","Hi")# "Hi, World!" s.split(",") # ["Hello"," World!"] len(s) # character count f"Hi, {name}!" # f-string
Control flow & loops
If / elif / else
if score >= 90:
    grade = "A"
elif score >= 75:
    grade = "B"
else:
    grade = "C"
Comparison & logical operators
| Operator | Meaning |
|---|---|
| == | Equal to |
| != | Not equal |
| > / < | Greater / less |
| >= / <= | Greater/less or equal |
| and | Both conditions true |
| or | Either condition true |
| not | Invert a condition |
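These operators combine directly inside conditions. A minimal sketch, assuming hypothetical score and attendance values:

score = 82
attendance = 0.9
score == 82                                   # True
score != 90                                   # True
passed = score >= 50 and attendance >= 0.8    # True (both conditions hold)
retake = score < 50 or attendance < 0.8       # False (neither condition holds)
not passed                                    # False (inverts the condition)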
For loops
# Loop over a list
for fruit in fruits:
    print(fruit)

# range(start, stop, step)
for i in range(1, 10, 2):
    print(i)                # 1, 3, 5, 7, 9

# Loop with index
for i in range(len(nums)):
    nums[i] = nums[i] * 2

# Loop over dict
for key, val in d.items():
    print(key, val)
While loops & list comprehension
# While loop
n = 0
while n < 5:
    print(n)
    n += 1

# List comprehension
nums = [1, 2, 3, 4, 5]
evens = [x * 2 for x in nums]      # → [2, 4, 6, 8, 10]

# With condition
big = [x for x in nums if x > 3]   # → [4, 5]
Functions
Defining & calling
def greet(name):
    return f"Hello, {name}!"

greet("Alice")   # → "Hello, Alice!"

# Default parameter
def greet(name="World"):
    return f"Hello, {name}!"

greet()          # → "Hello, World!"
Parameters & return values
def total_price(price, tax=0.09):
    return price * (1 + tax)

result = total_price(100, 0.1)   # result = 110.0

# Multiple return values
def min_max(lst):
    return min(lst), max(lst)

lo, hi = min_max([1, 5, 3])
Lambda & built-ins
# Lambda (anonymous function)
square = lambda x: x ** 2
square(4)           # → 16

# Useful built-ins
len([1, 2, 3])      # 3
sum([1, 2, 3])      # 6
max([1, 5, 3])      # 5
min([1, 5, 3])      # 1
round(3.14159, 2)   # 3.14
sorted([3, 1, 2])   # [1, 2, 3]
Scope & docstrings
def calc_roi(gain, cost):
    """
    Calculate return on investment.
    gain: total amount returned
    cost: initial investment
    """
    return (gain - cost) / cost * 100

# global variable
TAX_RATE = 0.09      # convention: CAPS

# local variable (only inside fn)
def fn():
    local_var = 5    # inaccessible outside
APIs & web scraping
Basic API call
import requests

response = requests.get("https://api.example.com/data")

# Check status
print(response.status_code)   # 200 = OK

# Parse JSON response
data = response.json()
print(data)                   # now a dict
API with parameters & key
# URL with params
url = "https://api.example.com/search"
params = {
    "term": "python",
    "api_key": "YOUR_KEY_HERE"
}
response = requests.get(url, params=params)
data = response.json()

# Access nested data
results = data["results"]
first = results[0]["name"]
HTTP status codes
| Code | Meaning |
|---|---|
| 200 | OK — success |
| 400 | Bad request |
| 401 | Unauthorized |
| 404 | Not found |
| 500 | Server error |
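A minimal sketch of acting on the status code (the URL is a placeholder; response.ok and raise_for_status() are standard requests helpers):

response = requests.get("https://api.example.com/data")
if response.status_code == 200:     # or: if response.ok
    data = response.json()
elif response.status_code == 404:
    print("Resource not found")
else:
    # raises requests.exceptions.HTTPError for 4xx/5xx responses
    response.raise_for_status()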
Beautiful Soup — scraping
from bs4 import BeautifulSoup
import requests

# Step 1: GET the page HTML
response = requests.get("https://example.com")

# Step 2: Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: Select elements
soup.select('a')            # all links
soup.select('.class-name')  # by CSS class
soup.select('#id')          # by id
soup.find('h1').text        # first h1 text
Scraping in a loop
all_titles = []
items = soup.select('.product-title')
for item in items:
    title = item.text.strip()
    all_titles.append(title)

# Get an attribute value
links = soup.select('a')
for link in links:
    href = link.get('href')
    print(href)
JSON to DataFrame
import pandas as pd

# From API list-of-dicts
df = pd.DataFrame(data["results"])

# From scraped list
df = pd.DataFrame({
    "title": all_titles,
    "url": all_urls
})
Pandas — data analysis
Loading & inspecting
import pandas as pd

df = pd.read_csv("data.csv")
df.head(5)      # first 5 rows
df.tail(3)      # last 3 rows
df.shape        # (rows, cols)
df.info()       # dtypes + nulls
df.describe()   # summary stats
df.columns      # column names
df.dtypes       # data types
Selecting data
# Single column → Series
df["price"]

# Multiple columns → DataFrame
df[["price", "quantity"]]

# Filter rows
df[df["price"] > 50]

# Multiple conditions
df[(df["price"] > 50) & (df["qty"] > 10)]
df[(df["city"] == "SG") | (df["city"] == "KL")]

# Row by index
df.iloc[0]           # first row
df.iloc[0]["name"]   # specific value
Sorting & chaining
# Sort
df.sort_values(by="price", ascending=True)
df.sort_values(by="price", ascending=False)

# Chain filter → column
df[df["color"] == "red"]["price"]

# Chain filter → describe
df[df["color"] == "red"]["price"].describe()

# New column
df["total"] = df["price"] * df["qty"]
Missing data
# Identify nulls
df.isnull().sum()                 # count per column

# Drop rows with any null
df.dropna()

# Drop nulls in specific column
df.dropna(subset=["price"])

# Fill nulls
df["price"].fillna(0)
df["price"].fillna(df["price"].mean())

# Drop duplicates
df.drop_duplicates()
df.drop_duplicates(subset=["id"])
groupby & aggregation
# Basic groupby
df.groupby(["category"]).count()
df.groupby(["category"])["sales"].sum()

# Multiple aggregations
df.groupby("category")["sales"].agg([
    "count", "mean", "min", "max"
])

# Group by multiple columns
df.groupby(["region", "category"]).mean()   # may need numeric_only=True on newer pandas
Combining DataFrames
# Concatenate (stack rows)
pd.concat([df1, df2])

# Concatenate (add columns)
pd.concat([df1, df2], axis=1)

# Merge (like SQL JOIN)
pd.merge(
    left_df, right_df,
    how="left",            # inner/left/right/outer
    left_on="id",
    right_on="user_id"
)
Data visualisation
Pandas plots (quick)
import matplotlib.pyplot as plt

# Line chart
df["price"].plot(kind="line", title="Price")

# Bar chart
df.groupby("category")["sales"].sum().plot(kind="bar")

# Histogram
df["age"].plot(kind="hist", bins=20)

# Scatter
df.plot(kind="scatter", x="area", y="price")

plt.show()
Plotly Express (interactive)
import plotly.express as px

# Bar chart
px.bar(df, x="category", y="sales", title="Sales by Category")

# Scatter plot
px.scatter(df, x="area", y="price", color="region")

# Line chart
px.line(df, x="date", y="value")

# Histogram
px.histogram(df, x="price", nbins=30)
Correlation & heatmap
# Correlation matrix
df.corr()

# Plotly heatmap
px.imshow(df.corr(),
          title="Correlation Heatmap",
          color_continuous_scale="RdBu")

# Quick value counts
df["category"].value_counts().plot(kind="bar")
Chart types guide
| Goal | Chart type |
|---|---|
| Compare categories | Bar chart |
| Show relationship | Scatter plot |
| Show distribution | Histogram / box |
| Change over time | Line chart |
| Part of a whole | Pie / stacked bar |
| Correlation strength | Heatmap |
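Two chart types from the table that are not shown elsewhere on this sheet, sketched with Plotly Express (the column names are placeholders):

# Distribution per group → box plot
px.box(df, x="category", y="price")

# Part of a whole → pie chart
px.pie(df, names="category", values="sales")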
Machine learning models
Linear regression formula
y = a₀ + a₁x₁ + a₂x₂ + … + aₙxₙ
y = predicted value · a₀ = intercept · a₁…aₙ = coefficients · x₁…xₙ = features
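As a hypothetical worked example (all numbers made up): with intercept a₀ = 50,000, coefficients a₁ = 300 (price per m²) and a₂ = 10,000 (per bedroom), a 100 m², 3-bedroom home would be predicted at

y = 50,000 + 300 × 100 + 10,000 × 3 = 110,000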
Linear regression in sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

X = df[["area", "bedrooms"]]   # features
y = df["price"]                # target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = model.score(X_test, y_test)
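To read the fitted line back in terms of the formula above, the learned intercept and coefficients live on the model (a sketch; intercept_ and coef_ are standard sklearn attributes):

model.intercept_   # a₀
model.coef_        # array of a₁…aₙ, one per feature column
# e.g. manual prediction for one row:
# model.intercept_ + (model.coef_ * X_test.iloc[0]).sum()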
KNN classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix
)

X = df[["humidity", "temp"]]
y = df["will_rain"]            # categorical

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
Model evaluation metrics
| Metric | Use for | Good = ? |
|---|---|---|
| RMSE | Regression | Lower = better |
| R² | Regression | Closer to 1 |
| Accuracy | Classification | Higher = better |
| Precision | Classification | Higher = better |
| Recall | Classification | Higher = better |
Confusion matrix: TP = true positive, TN = true negative, FP = false positive (type I), FN
= false negative (type II). Context matters — for medical tests, FN is usually more costly.
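A sketch of computing precision and recall from the KNN predictions above (assuming will_rain is a binary 0/1 label; otherwise pass pos_label=):

from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)   # TP / (TP + FP)
recall = recall_score(y_test, y_pred)         # TP / (TP + FN)

# sklearn confusion_matrix layout for 0/1 labels:
# [[TN, FP],
#  [FN, TP]]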
Sentiment analysis (NLP)
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Bag-of-words vectorisation
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["text"])

# Word cloud
wc = WordCloud(background_color="white")
wc.generate(" ".join(df["text"]))
wc.to_image()
Linear regression assumptions (LINE)
| Letter | Assumption |
|---|---|
| L | Linearity — x and y have a linear relationship |
| I | Independence — observations don't affect each other |
| N | Normality — residuals are normally distributed |
| E | Equal variances (homoscedasticity) |
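One quick, informal check on these assumptions is a residual plot from the regression above (a sketch, not a formal test): residuals should scatter randomly around 0 with roughly constant spread.

residuals = y_test - y_pred

# Residuals vs predicted values (look for no pattern, even spread)
px.scatter(x=y_pred, y=residuals, labels={"x": "predicted", "y": "residual"})

# Roughly normal residuals?
px.histogram(x=residuals, nbins=30)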
KNN — choosing k
| k value | Effect |
|---|---|
| Small k (e.g. 1) | Very flexible, prone to overfitting |
| Large k (e.g. 50) | Smoother, may underfit |
| Rule of thumb | Start with √n; tune with cross-validation |
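A sketch of tuning k with cross-validation (uses scikit-learn's cross_val_score; the range of k values tried is an arbitrary choice):

from sklearn.model_selection import cross_val_score

scores_by_k = {}
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    # 5-fold cross-validated accuracy on the training set
    scores = cross_val_score(knn, X_train, y_train, cv=5)
    scores_by_k[k] = scores.mean()

best_k = max(scores_by_k, key=scores_by_k.get)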
Time series data
Loading & parsing dates
import pandas as pd

df = pd.read_csv(
    "data.csv",
    parse_dates=["date"],   # auto-parse
    index_col="date"        # set as index
)

# Or convert after loading
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
Resampling
# Resample to monthly mean
df.resample("M").mean()

# Common frequency aliases
# "D" = daily      "W" = weekly
# "M" = monthly    "Q" = quarterly
# "Y" = yearly

df.resample("W").sum()
df.resample("Q").max()
Rolling windows
# 7-day rolling mean
df["price"].rolling(window=7).mean()

# 30-day rolling std deviation
df["price"].rolling(30).std()

# Add as new column
df["7d_avg"] = df["price"].rolling(7).mean()

# Expanding (cumulative)
df["price"].expanding().mean()
Shifting & diff
# Lag — previous period value
df["prev_price"] = df["price"].shift(1)

# % change from previous period
df["pct_change"] = df["price"].pct_change()

# Absolute change
df["diff"] = df["price"].diff()

# Lead — next period
df["next_price"] = df["price"].shift(-1)
Train/test split (time-ordered)
# CRITICAL: do NOT randomise!
# Keep data in chronological order
split = int(len(df) * 0.8)
train = df.iloc[:split]   # first 80%
test = df.iloc[split:]    # last 20%
# e.g. 3 years train → 1 year test
Visualising time series
import plotly.express as px

# Line chart (auto-handles datetime index)
px.line(df, y="price", title="Price over time")

# With rolling average overlay
df["7d_avg"] = df["price"].rolling(7).mean()
px.line(df, y=["price", "7d_avg"])