Python Cheat sheet
This cheat sheet is adapted from the ADA course materials, with special thanks to Mehdi, Jiaming, Yanzi and Davide for their contributions.
Pandas basics
Initialize a dataframe
import pandas as pd

data = pd.DataFrame({'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
                     'patient': [1, 1, 1, 1, 2, 2, 2, 2],
                     'phylum': ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes',
                                'Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes']})
Rename the column names
# Rename the 'old_name' column to 'new_name'
df.rename(columns={'old_name': 'new_name'}, inplace=True)
Set indexes
Set one column as index
# Create a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
        'City': ['New York', 'San Francisco', 'Los Angeles']}
df = pd.DataFrame(data)

# Set the 'Name' column as the index
df.set_index('Name', inplace=True)
Reset indexes
# Reset the index
df.reset_index(inplace=True)
# After resetting, the DataFrame has the default integer index again
Reshape (Concat and join)
Concat (along the rows or cols)
# Sample DataFrames
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                    'B': ['B0', 'B1', 'B2']})
df2 = pd.DataFrame({'A': ['A3', 'A4', 'A5'],
                    'B': ['B3', 'B4', 'B5']})

# Concatenate along rows (axis=0)
result = pd.concat([df1, df2], axis=0)

# Concatenate along columns (axis=1)
result = pd.concat([df1, df2], axis=1)
Join (on columns)
# Sample DataFrames
df1 = pd.DataFrame({'key1': ['A', 'B', 'C', 'D'],
                    'key2': ['X', 'Y', 'Z', 'X'],
                    'value1': [1, 2, 3, 4]})
df2 = pd.DataFrame({'key1': ['B', 'D', 'E', 'F'],
                    'key2': ['Y', 'X', 'Z', 'W'],
                    'value2': [5, 6, 7, 8]})

# Join based on multiple columns ('key1' and 'key2')
result = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')
- Inner Join: Returns rows with common values in both dataframes.
- Outer Join (Full Outer Join): Returns all rows and fills in missing values with NaN.
- Left Join (Left Outer Join): Returns all rows from the left dataframe and matching rows from the right dataframe.
- Right Join (Right Outer Join): Returns all rows from the right dataframe and matching rows from the left dataframe. (See the sketch below for how the how= argument changes the result.)
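Continuing the sample dataframes above, a quick sketch of how the how= argument changes the result (row counts follow from the sample keys):

# Inner join: only key combinations present in both -> 2 rows here
inner = pd.merge(df1, df2, on=['key1', 'key2'], how='inner')
# Outer join: union of key combinations, gaps filled with NaN -> 6 rows here
outer = pd.merge(df1, df2, on=['key1', 'key2'], how='outer')
# Left join: all rows of df1, unmatched rows get NaN in 'value2' -> 4 rows here
left = pd.merge(df1, df2, on=['key1', 'key2'], how='left')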
Sort
Sort values
# Sort by 'Age' in ascending order, then by 'Salary' in descending order
df_sorted = df.sort_values(by=['Age', 'Salary'], ascending=[True, False])

# Python's built-in sorted() for plain lists
numbers = [3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5]

# Sorting in ascending order (default)
sorted_numbers_asc = sorted(numbers)
print(sorted_numbers_asc)  # Output: [1, 1, 2, 3, 3, 4, 5, 5, 5, 6, 9]

# Sorting in descending order
sorted_numbers_desc = sorted(numbers, reverse=True)
print(sorted_numbers_desc)  # Output: [9, 6, 5, 5, 5, 4, 3, 3, 2, 1, 1]
Sort indexes
# Sort the index in descending order
df_sorted_index_desc = df.sort_index(ascending=False)
Find, Replace, Drop
loc and iloc
# Select rows 10-20 by position
df.iloc[10:20]
# Select columns in positions 1, 2 and 5 (first column is 0)
df.iloc[:, [1, 2, 5]]
# Select all columns between x2 and x4 (inclusive)
df.loc[:, 'x2':'x4']
# Select rows meeting a logical condition, and only the specified columns
df.loc[df['a'] > 10, ['a', 'c']]
# loc a Series item by its index label
tmp_a.loc[2008]
Filtering rows
# Filter on numbers
filtered_df = df[df['Age'] > 30]
# Filter on strings
male_df = df[df['Gender'] == 'Male']
filtered_df = df[df['column_1'].str.endswith("ple")]
# Multiple criteria
filtered_df = df[(df['Column1'] >= 20) & (df['Column2'] < 40)]
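Two other common filtering idioms (a small sketch; the column names are illustrative, not from the original):

# Keep rows whose value is in a given set
filtered_df = df[df['City'].isin(['New York', 'Boston'])]
# Query-style filtering with a string expression
filtered_df = df.query('Age > 30 and Gender == "Male"')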
Replacing data
# Replace a string value
df['Gender'].replace('Male', 'M', inplace=True)
# Replace the end of strings in 'column_1' from "ple" to "abc"
df['column_1'] = df['column_1'].str.replace(r'ple$', 'abc', regex=True)
# Replace numbers larger than 100 in the 'Value' column with 100
df.loc[df['Value'] > 100, 'Value'] = 100
# Replace multiple values at once
cdystonia.treat.replace({0: 'Placebo', 1: '5000U', 2: '10000U'})

# Fill NaN
# Fill missing values with a specific value (e.g., 0)
df_filled = df.fillna(0)
# Fill missing values with the mean of the column
df_filled_mean = df.fillna(df.mean())

# Removing duplicates
df.drop_duplicates(inplace=True)

# Changing data types
df['column_name'] = df['column_name'].astype('desired_data_type')
Drop
# Drop rows
df.drop(df[df['Salary'] < 50000].index, inplace=True)
# Drop columns
df.drop(columns=['Column1', 'Column2'], inplace=True)
# Drop rows with missing values
df_dropped = df.dropna()
# Remove duplicate rows
df_no_duplicates = df.drop_duplicates()
Numerical features
Value counts
# Count the number of rows with each unique value of a column
df['w'].value_counts()
# Number of distinct values in a column
df['w'].nunique()
Mean, min, max, std, median
# Per column; same pattern for mean, max, std, median
min_value = df['Column1'].min()
# Or compute several statistics at once
df['Column1'].agg(['mean', 'median', 'min', 'max'])

# Per row
std_values_per_row = df.std(axis=1)
Percentiles
df['Column1'].quantile(0.25)  # 25th percentile

# Cut data into quantile-based bins (here quintiles)
quintiles = pd.qcut([v for _, v in sorted(scores_dict.items())], q=5, labels=False)
Correlation and covariance
df[['Column1', 'Column2']].corr()
df[['Column1', 'Column2']].cov()
Groupby
Group and get the mean, count, median, etc
After group-by, the group-by column will be the index of the output.
df.groupby('Category')['Value'].mean()

# Calculate counts in column A for each unique value in column B
counts_by_category = df.groupby('column_B')['column_A'].count()

# Calculate the mean of corresponding values in column A for each unique value in column B
mean_by_category = df.groupby('column_B')['column_A'].mean()

# Calculate the median of column D for each unique combination of columns A and B
median_by_combination = df.groupby(['column_A', 'column_B'])['column_D'].median()

# Calculate the maximum value in column E for each year in column F
max_by_year = df.groupby('column_F')['column_E'].max()
Calculate anything after group-by
# Any customized function
# Get the fraction of positive values in each group
df.groupby(['YEA', 'TGT'])['VOT'].apply(lambda x: np.mean(x > 0))

# Get the range of each group
def price_range(x):
    value_range = x.max() - x.min()
    # value_range = x.quantile(0.975) - x.quantile(0.025)
    return value_range

result = df.groupby('Category')['Price'].apply(price_range)

# Calculate the fraction of a certain category X in column C
# for each unique value in column B
def calculate_fraction(data, category_x):
    total_count = data['column_C'].count()
    x_count = data[data['column_C'] == category_x]['column_C'].count()
    return x_count / total_count

fraction_by_category = df.groupby('column_B').apply(lambda x: calculate_fraction(x, 'category_X'))
Reset the index of the group-by result
# Reset the index ('YEA', 'TGT') of the groupby result
# and set the new index ('YEA')
tmp = df.groupby(["YEA", "TGT"]).VOT
tmp_a = tmp.count().reset_index().groupby("YEA").VOT.count()
Aggregate functions
import pandas as pd

data = {'Category': ['A', 'B', 'A', 'B', 'A'],
        'Value': [10, 20, 15, 25, 30]}
df = pd.DataFrame(data)

# Group by 'Category' and aggregate 'Value'
grouped = df.groupby('Category')
result = grouped['Value'].agg(['sum', 'mean', 'max']).reset_index()
print(result)

#   Category  sum       mean  max
# 0        A   55  18.333333   30
# 1        B   45  22.500000   25
Math operations
# For one column
def custom_function(x):
    return x ** 2

df['Squared'] = df['Column1'].apply(custom_function)

# For multiple columns
df['Result'] = df['Column1'] + df['Column2']
baseball['obp'] = baseball.apply(
    lambda p: (p.h + p.bb + p.hbp) / (p.ab + p.bb + p.hbp + p.sf)
    if (p.ab + p.bb + p.hbp + p.sf) != 0.0 else 0.0,
    axis=1)
Visualization
Basic charts
Use cases of different plots
- Bar Charts:
- Use Case: Comparing values across categories.
- Example: Comparing sales performance for different products.
- Histograms:
- Use Case: Showing the distribution of a continuous variable.
- Example: Displaying the distribution of ages in a population.
- Line Charts:
- Use Case: Visualizing trends over a continuous variable, often time.
- Example: Showing the stock prices over a period of time.
- Scatter Plots:
- Use Case: Examining the relationship between two continuous variables.
- Example: Plotting the relationship between height and weight.
- Box Plots (Box-and-Whisker Plots):
- Use Case: Displaying the distribution of a dataset and highlighting outliers.
- Example: Comparing the distribution of exam scores across different classes.
- Pie Charts:
- Use Case: Showing the proportion of each category in a whole.
- Example: Displaying the percentage distribution of expenses in a budget.
- Heatmaps:
- Use Case: Visualizing the magnitude of a phenomenon across two categorical variables.
- Example: Displaying the correlation matrix between variables.
- Violin Plots:
- Use Case: Combining the benefits of a box plot and a kernel density plot.
- Example: Visualizing the distribution of a variable across different groups.
- Radar Charts:
- Use Case: Comparing multiple quantitative variables across different categories.
- Example: Comparing the skill levels of individuals in different sports.
- Treemaps:
- Use Case: Displaying hierarchical data as nested rectangles.
- Example: Visualizing the composition of expenses in a budget hierarchy.
- Choropleth Maps:
- Use Case: Showing spatial variations in a variable across regions.
- Example: Displaying population density across different countries.
Bar chart
import matplotlib.pyplot as plt

plt.bar(df['categories'], df['values'], color='blue')
plt.xlabel('Categories')
plt.ylabel('Values')
plt.title('Bar Chart')
plt.show()
Histogram
plt.hist(df['variable'], bins=20, color='green', alpha=0.7)
plt.xlabel('Variable')
plt.ylabel('Frequency')
plt.title('Histogram')
plt.show()
Line chart
plt.plot(df['time'], df['values'], marker='o', linestyle='-', color='red')
plt.xlabel('Time')
plt.ylabel('Values')
plt.title('Line Chart')
plt.show()
Scatter plot
plt.scatter(df['x'], df['y'], color='purple', alpha=0.5)
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Scatter Plot')
plt.show()
Box plot
import seaborn as sns

sns.boxplot(x=df['category'], y=df['values'], palette='Set3')
plt.xlabel('Category')
plt.ylabel('Values')
plt.title('Box Plot')
plt.show()
Pie chart
plt.pie(df['values'], labels=df['categories'], autopct='%1.1f%%', colors=['gold', 'lightcoral'])
plt.title('Pie Chart')
plt.show()
Heatmap
import seaborn as sns

heatmap_data = df.pivot(index='row_variable', columns='column_variable', values='values')
sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt=".2f", linewidths=.5)
plt.title('Heatmap')
plt.show()

# Normalize the heatmap
# Your original array
data = np.array([[  63,   5,   4,   0,    1],
                 [  61,  68,  16,  11,    4],
                 [ 123,  87, 107,  35,    9],
                 [ 136, 145, 192, 192,   56],
                 [ 212, 306, 493, 681, 1381]])

# Normalize rows to sum up to 1
normalized_data = data / data.sum(axis=1, keepdims=True)

# Create a heatmap
sns.heatmap(normalized_data, annot=True, fmt=".2f")  # Format to 2 decimal places
plt.ylabel("Row Index")
plt.xlabel("Column Index")
plt.title("Normalized Heatmap (Rows sum to 1)")
plt.show()
Violin plot
sns.violinplot(x=df['category'], y=df['values'], palette='viridis')
plt.xlabel('Category')
plt.ylabel('Values')
plt.title('Violin Plot')
plt.show()
Radar chart
from math import pi

categories = list(df.columns[1:])
values = df.iloc[0].tolist()[1:]
values += values[:1]  # To close the circular graph
angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
angles += angles[:1]

plt.polar(angles, values, 'o-', color='orange', linewidth=2)
plt.fill(angles, values, color='orange', alpha=0.25)
plt.title('Radar Chart')
plt.show()
Treemap
import squarify

squarify.plot(sizes=df['values'], label=df['categories'], color=['skyblue', 'salmon', 'lightgreen'])
plt.title('Treemap')
plt.axis('off')
plt.show()
Choropleth map
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Assuming 'gdf' is a GeoDataFrame with geometry and value columns
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.1)
gdf.plot(column='values', cmap='YlOrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True, cax=cax)
ax.set_title('Choropleth Map')
plt.show()
Q-Q plot
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np

# Assuming 'sample_data' is your sample data
sample_data = np.random.normal(loc=0, scale=1, size=1000)  # Replace this with your actual sample data

# Create a Q-Q plot
stats.probplot(sample_data, dist='norm', plot=plt)
plt.title('Q-Q Plot')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Sample Quantiles')
plt.show()
Pair Plot
import seaborn as sns

sns.pairplot(data)
Dimension reduction (TSNE/PCA)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# dimension reduction using tsne
X_reduced_tsne = TSNE(n_components=2, init='random', learning_rate='auto', random_state=0).fit_transform(X)
# dimension reduction using pca
X_reduced_pca = PCA(n_components=2).fit_transform(X)
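To inspect the result visually, a quick scatter of the two reduced dimensions (a sketch; the color vector y is an assumed label array, not defined above):

plt.scatter(X_reduced_tsne[:, 0], X_reduced_tsne[:, 1], c=y, cmap='viridis', s=10, alpha=0.6)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.title("t-SNE projection")
plt.show()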
CDF and CCDF
The Cumulative Distribution Function (CDF) gives, for each value x, the fraction of data points less than or equal to x. The Complementary CDF (CCDF) gives the fraction strictly greater than x; it is often plotted with a log-scaled axis to inspect heavy tails.
import seaborn as sns
import warnings
# Creates complementary CDF with log scale
# FOR CDF: complementary=False
sns.ecdfplot(df[df["throws"] == "L"].salary, label="Left-handed", complementary=True)
sns.ecdfplot(df[df["throws"] == "R"].salary, label="Right-handed", complementary=True)
plt.xscale("log")
plt.legend()
plt.title("CCDF Salary")
A bit more about the calculation of CDF and CCDF
from collections import Counter
import numpy as np

def get_ccdf(var_list):
    """
    Get the complementary cumulative distribution function of a list of values
    """
    var_count = Counter(var_list)
    var_count = sorted(var_count.items(), key=lambda x: x[0])
    ccdf = 1 - np.cumsum([x[1] for x in var_count]) / sum([x[1] for x in var_count])
    return [x[0] for x in var_count], ccdf

def get_cdf(var_list):
    """
    Get the cumulative distribution function of a list of values
    """
    var_count = Counter(var_list)
    var_count = sorted(var_count.items(), key=lambda x: x[0])
    cdf = np.cumsum([x[1] for x in var_count]) / sum([x[1] for x in var_count])
    return [x[0] for x in var_count], cdf
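A minimal usage sketch for the helpers above (the values list here is made up for illustration):

values = [1, 1, 2, 3, 3, 3, 5, 8, 13]
xs, ccdf = get_ccdf(values)
plt.plot(xs, ccdf, marker='o')
plt.xlabel("Value")
plt.ylabel("P(X > x)")
plt.title("CCDF")
plt.show()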
PDF (probability density function)
The Probability Density Function (PDF) describes the relative likelihood of each value: the area under the curve over an interval gives the probability of falling in that interval, and the whole curve integrates to 1.
import seaborn as sns
import matplotlib.pyplot as plt

# Plotting the PDF (histogram with a kernel density estimate)
sns.histplot(df[df["throws"] == "L"].salary, kde=True, stat="density", label="Left-handed")

# Adding labels and title for clarity
plt.xlabel("Salary")
plt.ylabel("Density")
plt.title("PDF of salary")
plt.legend()
plt.show()
Layout
# Creating Multiple Subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes[0, 0].plot(df['x'], df['y'], marker='o', linestyle='-', color='red')
axes[0, 0].set_xlabel('X-axis')
axes[0, 0].set_ylabel('Y-axis')
axes[0, 0].set_title('Subplot 1')
# Add more subplots as needed
plt.tight_layout()
plt.show()
Simple statistics
Sample data
# Sample 10 rows with replacement
sample1_counties = df.sample(n=10, replace=True)

# Sample 10 rows without replacement
sample1_counties = df.sample(n=10, replace=False)

# Sometimes we want to sample in an unbalanced way, so that we upsample datapoints with a certain
# characteristic and downsample the others. This can be achieved with the weights parameter.
# Here we sample by upsampling counties with large population
sample2_counties = df.sample(n=10, replace=False, weights=df['TotalPop'])

# Sample a fraction
# Calculate the desired sample size as 0.1% of the total rows
percentage = 0.1  # Change this to the desired percentage
sample_size = int(len(df) * (percentage / 100))

# Perform the sample
sampled_df = df.sample(n=sample_size, replace=False)
Distribution test
from statsmodels.stats import diagnostic

# Get a statistical description for every column of df
df.describe()

# Test to verify if the data come from a normal distribution
diagnostic.kstest_normal(df['IncomePerCap'].values, dist='norm')
# Output: (statistic, p-value)
# p_value < 0.05 -> not a normal distribution!

# Test to verify if the data come from an exponential distribution
diagnostic.kstest_normal(df['IncomePerCap'].values, dist='exp')
# Output: (statistic, p-value)
# p_value < 0.05 -> not an exponential distribution!
95% CI of the mean
Calculate and point plot
salaries = df.groupby(["throws"]).salary.agg(["mean", "sem"])
salaries["low_ci"] = salaries["mean"] - 1.96 * salaries["sem"]
salaries["high_ci"] = salaries["mean"] + 1.96 * salaries["sem"]

# Shows confidence intervals
display(salaries)

# Simple plot
sns.pointplot(x="throws", y="salary", data=df_pitching)
plt.title("Average salary for left-\nand right-handed throwers")
Line plot with 95% confidence intervals
fig, axs = plt.subplots(1, 2, figsize=(14, 4))
for idx, col_agg in enumerate(["salary", "BAOpp"]):
    df_col_agg = df.groupby(["yearID", "throws"])[col_agg].agg(["mean", "sem"]).reset_index()
    df_col_agg_L = df_col_agg[df_col_agg.throws == "L"]
    df_col_agg_R = df_col_agg[df_col_agg.throws == "R"]

    axs[idx].plot(df_col_agg_L["yearID"], df_col_agg_L["mean"], color="tab:red", label="lefties")
    axs[idx].fill_between(df_col_agg_L["yearID"],
                          df_col_agg_L["mean"] - 1.96 * df_col_agg_L["sem"],
                          df_col_agg_L["mean"] + 1.96 * df_col_agg_L["sem"],
                          alpha=0.25, color="tab:red")

    axs[idx].plot(df_col_agg_R["yearID"], df_col_agg_R["mean"], color="tab:blue", label="righties")
    axs[idx].fill_between(df_col_agg_R["yearID"],
                          df_col_agg_R["mean"] - 1.96 * df_col_agg_R["sem"],
                          df_col_agg_R["mean"] + 1.96 * df_col_agg_R["sem"],
                          alpha=0.25, color="tab:blue")

    print("avg", col_agg, "in 1999 for lefties:", df_col_agg_L[df_col_agg_L["yearID"] == 1999]["mean"].values[0])
    print("avg", col_agg, "in 1999 for righties:", df_col_agg_R[df_col_agg_R["yearID"] == 1999]["mean"].values[0])

axs[0].set_title("A) Average salary per year")
axs[1].set_title("B) Avg. opponents' batting average per year")
axs[0].set_xlabel("Year")
axs[1].set_xlabel("Year")
plt.legend()
T-test (of two means)
import numpy as np
import scipy.stats

# t-test for the null hypothesis that the two independent samples have identical means
# dropna if needed!
x1 = df[(df["throws"] == "L")].salary.dropna().values
x2 = df[(df["throws"] == "R")].salary.dropna().values
print(np.mean(x1), np.mean(x2))
display(scipy.stats.ttest_ind(x1, x2))
# Output: (statistic, p_value)
# p_value < 0.05 : we reject the null hypothesis
Correlation test
# Pearson's correlation: amount of linear dependence
stats.pearsonr(df['IncomePerCap'], df['Employed'])
# Output: (Pearson correlation, p_value)
# p_value < 0.05 : significant correlation

# Spearman's rank correlation
stats.spearmanr(df['IncomePerCap'], df['Employed'])
# Output: (Spearman correlation, p_value)
# p_value < 0.05 : significant correlation
Machine learning
Pre-processing
Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)
Change categorical data to numerical
import pandas as pd

data = {'Color': ['Red', 'Green', 'Blue', 'Green', 'Red']}
df = pd.DataFrame(data)

dummies = pd.get_dummies(df['Color'], prefix='Color')
# or, for a binary column:
(df["gender"] == "F").astype("int").values
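If you prefer scikit-learn's transformers (an alternative sketch, not from the original), OneHotEncoder does the same job and can be reused on new data:

from sklearn.preprocessing import OneHotEncoder
import pandas as pd

df = pd.DataFrame({'Color': ['Red', 'Green', 'Blue', 'Green', 'Red']})
enc = OneHotEncoder(sparse_output=False)      # on older scikit-learn versions, use sparse=False
encoded = enc.fit_transform(df[['Color']])    # one column per category
print(enc.get_feature_names_out(['Color']))   # e.g. ['Color_Blue' 'Color_Green' 'Color_Red']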
Models
Regression
Logistic regression using smf
import statsmodels.api as sm
import statsmodels.formula.api as smf

mod = smf.logit(formula='y ~ x + C(discrete_x)', data=df)
res = mod.fit()
print(res.summary())
Linear regression using smf
import statsmodels.api as sm
import statsmodels.formula.api as smf

mod = smf.ols(formula='y ~ x + C(discrete_x)', data=df)
res = mod.fit()
print(res.summary())
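Once fitted, the statsmodels results object can also produce predictions and coefficient confidence intervals (a short sketch; new_df is an assumed dataframe with the same columns as df):

# Predict on new data with the same column names as the formula
predictions = res.predict(new_df)

# Coefficients and their 95% confidence intervals
print(res.params)
print(res.conf_int())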
Clustering
K-Means for multiple values of K
Overall example
from sklearn.cluster import KMeans

MIN_CLUSTERS = 2
MAX_CLUSTERS = 10

# Compute number of rows and columns
COLUMNS = 3
ROWS = math.ceil((MAX_CLUSTERS - MIN_CLUSTERS) / COLUMNS)
fig, axs = plt.subplots(ROWS, COLUMNS, figsize=(10, 8), sharey=True, sharex=True)

# Plot the clusters
for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS + 1):
    current_column = (n_clusters - MIN_CLUSTERS) % COLUMNS
    current_row = (n_clusters - MIN_CLUSTERS) // COLUMNS
    # Get the axis where to add the plot
    ax = axs[current_row, current_column]
    # Cluster the data with the current number of clusters
    kmean = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    # Plot the data by using the labels as color
    ax.scatter(X[:, 0], X[:, 1], c=kmean.labels_, alpha=0.6)
    ax.set_title("%s clusters" % n_clusters)
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    # Plot the centroids
    for c in kmean.cluster_centers_:
        ax.scatter(c[0], c[1], marker="+", color="red")

plt.tight_layout()
Silhouette score to find the optimal k for k-means
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

silhouettes = []

# Try multiple k
for k in range(2, 11):
    # Cluster the data and assign the labels
    labels = KMeans(n_clusters=k, random_state=10).fit_predict(X)
    # Get the Silhouette score
    score = silhouette_score(X, labels)
    silhouettes.append({"k": k, "score": score})

# Convert to dataframe
silhouettes = pd.DataFrame(silhouettes)

# Plot the data
plt.plot(silhouettes.k, silhouettes.score)
plt.xlabel("K")
plt.ylabel("Silhouette score")
Elbow method to find the optimal k for k-means
from sklearn.cluster import KMeans

def plot_sse(features_X, start=2, end=11):
    sse = []
    for k in range(start, end):
        # Fit k-means and record the sum of squared errors (inertia)
        kmeans = KMeans(n_clusters=k, random_state=10).fit(features_X)
        sse.append({"k": k, "sse": kmeans.inertia_})
    sse = pd.DataFrame(sse)
    # Plot the data
    plt.plot(sse.k, sse.sse)
    plt.xlabel("K")
    plt.ylabel("Sum of Squared Errors")

plot_sse(X)
DBSCAN for multiple values of epsilon
from sklearn.cluster import DBSCAN

# Create a list of eps
eps_list = np.linspace(0.05, 0.15, 14)

# Compute number of rows and columns
COLUMNS = 7
ROWS = math.ceil(len(eps_list) / COLUMNS)
fig, axs = plt.subplots(ROWS, COLUMNS, figsize=(12, 4), sharey=True, sharex=True)

for i in range(0, len(eps_list)):
    eps = eps_list[i]
    current_column = i % COLUMNS
    current_row = i // COLUMNS
    ax = axs[current_row, current_column]

    labels = DBSCAN(eps=eps).fit_predict(X_moons)
    ax.scatter(X_moons[:, 0], X_moons[:, 1], c=labels, alpha=0.6)
    ax.set_title("eps = {:.3f}".format(eps))

plt.tight_layout()
Fine-tune parameters (Grid search)
# 2. Import libraries and modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import joblib
# 3. Load red wine data.
dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url, sep=';')
# 4. Split data into training and test sets
y = data.quality
X = data.drop('quality', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2,
random_state=123,
stratify=y)
# 5. Declare data preprocessing steps
pipeline = make_pipeline(preprocessing.StandardScaler(),
RandomForestRegressor(n_estimators=100,
random_state=123))
# 6. Declare hyperparameters to tune
hyperparameters = {'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],  # 'auto' was removed in newer scikit-learn
                   'randomforestregressor__max_depth': [None, 5, 3, 1]}
# 7. Tune model using cross-validation pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
clf.fit(X_train, y_train)
# 8. Refit on the entire training set
# No additional code needed if clf.refit == True (default is True)
# 9. Evaluate model pipeline on test data
pred = clf.predict(X_test)
print(r2_score(y_test, pred))
print(mean_squared_error(y_test, pred))
# Note: accuracy_score only applies to classification, not to this regressor.
# For a fitted linear model you would inspect model.coef_ and model.intercept_ instead.
# 10. Save model for future use
joblib.dump(clf, 'rf_regressor.pkl')
# To load: clf2 = joblib.load('rf_regressor.pkl')
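After fitting, GridSearchCV also exposes the best hyperparameter combination and its cross-validated score:

print(clf.best_params_)   # best hyperparameter combination found
print(clf.best_score_)    # mean cross-validated score of the best estimator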
Evaluation
ROC
from sklearn.metrics import roc_auc_score

print("roc score", roc_auc_score(y, y_pred))
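To plot the full ROC curve rather than only the AUC (a sketch; y are assumed true binary labels and y_score assumed predicted probabilities):

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(y, y_score)
plt.plot(fpr, tpr, label="AUC = {:.3f}".format(auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance line
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve")
plt.legend()
plt.show()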
Confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# true_labels: the true labels of the test set
# predicted_labels: the labels predicted by your model
cm = confusion_matrix(true_labels, predicted_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
Text Data
Remove special characters like \n and \t
text = [" ".join(b.split()) for b in text]
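The spaCy snippets below operate on a doc object; a minimal setup sketch (assuming the small English model en_core_web_sm is installed):

import spacy

nlp = spacy.load("en_core_web_sm")  # install with: python -m spacy download en_core_web_sm
doc = nlp("Apple is looking at buying a U.K. startup for $1 billion.")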
Entities extraction
for ent in doc.ents:
    print(ent.text, ent.label_)
Removing stop words
import spacy

spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
non_stop_words = [token.text for token in doc if not token.is_stop]
Noun Chunks
for chunk in doc.noun_chunks:
    print(chunk.text)
Counting word occurrences
from collections import Counter

words = [token.text for token in doc]
word_freq = Counter(words)
common_words = word_freq.most_common()
Sentiment Analysis
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
vs = analyzer.polarity_scores(example)

# The sentiment score consists of four values. Neutral, positive and negative sum to one.
# The final score is obtained by thresholding the compound value (e.g. +/-0.05).
print('Negative sentiment:', vs['neg'])
print('Neutral sentiment:', vs['neu'])
print('Positive sentiment:', vs['pos'])
print('Compound sentiment:', vs['compound'])
Bag of words representation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Initialize with a minimum number of occurrences to avoid an intractable number of features:
# vectorizer = CountVectorizer(min_df=2)

# Create bag-of-words features
X = vectorizer.fit_transform(chunks)
Topic detection
import pyLDAvis.gensim_models

STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

processed_docs = list()
for doc in nlp.pipe(chunks, n_process=5, batch_size=10):
    # Process document using the spaCy NLP pipeline
    ents = doc.ents  # Named entities

    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation and remove stopwords.
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

    # Remove common words from a stopword list and keep only words of length 3 or more.
    doc = [token for token in doc if token not in STOPWORDS and len(token) > 2]

    # Add named entities, but only if they are a compound of more than one word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    processed_docs.append(doc)

docs = processed_docs
del processed_docs

# Add bigrams too
from gensim.models.phrases import Phrases

# Add bigrams to docs (only ones that appear 15 times or more).
bigram = Phrases(docs, min_count=15)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add it to the document.
            docs[idx].append(token)

# Models
from gensim.models import LdaMulticore

params = {'passes': 10, 'random_state': seed}
base_models = dict()
model = LdaMulticore(corpus=corpus, num_topics=4, id2word=dictionary, workers=6,
                     passes=params['passes'], random_state=params['random_state'])

# Plot topics
data = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

# Assignment: most likely topic per document
sent_to_cluster = list()
for n, doc in enumerate(corpus):
    if doc:
        cluster = max(model[doc], key=lambda x: x[1])
        sent_to_cluster.append(cluster[0])
Regex (Regular Expressions)
Simply use pandas to match strings
# Find a pattern
df['Column'].str.contains('pattern')
# Start of a string
df['Column'].str.startswith('start_pattern')
# End of a string
df['Column'].str.endswith('end_pattern')
# Match any character (except a newline)
df['Column'].str.contains('a.b')
# Match a set. [abc]: matches any single character 'a', 'b', or 'c'
df['Column'].str.contains('[aeiou]')
# Match a range. [0-9]: matches any digit from 0 to 9
df['Column'].str.contains('[0-9]')
# Repetitions
df['Column'].str.contains('a{2}')    # Matches 'aa'
df['Column'].str.contains('a{2,4}')  # Matches 'aa', 'aaa', or 'aaaa'
# Match a word boundary
df['Column'].str.contains(r'\bword\b')
# Exclude a pattern
df['Column'].str.contains(r'^(?!exclude_pattern).*$')
Re
import re

def extract_matched_strings(text):
    pattern = r'\d+'  # Example: match one or more digits
    matches = re.findall(pattern, text)
    return ', '.join(matches) if matches else None

data = {'Column1': ['abc123', 'def456', 'xyz789']}
df = pd.DataFrame(data)
df['Matched'] = df['Column1'].apply(extract_matched_strings)
Regex examples
Search and test here:
regex101: build, test, and debug regex
pyrexp: test and visualize regex
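A few common patterns with the re module (a generic sketch; the sample text and patterns are made up for illustration):

import re

text = "Contact: alice@example.com, born 1994-05-17, phone +41 21 693 11 11"

# Extract an email address
print(re.search(r'[\w.+-]+@[\w-]+\.[\w.]+', text).group())
# Extract an ISO date (YYYY-MM-DD)
print(re.search(r'\d{4}-\d{2}-\d{2}', text).group())
# Find all runs of digits
print(re.findall(r'\d+', text))
# Mask every digit
print(re.sub(r'\d', '#', text))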
Network analysis
Generate the network
import networkx as nx
# Assuming 'nodes_df' is the DataFrame for nodes and 'edges_df' is the DataFrame for edges
G = nx.Graph()
# Adding nodes with attributes
for index, node_data in nodes_df.iterrows():
G.add_node(node_data['node_id'], attr1=node_data['attribute1'], attr2=node_data['attribute2'])
# Adding edges
for index, edge_data in edges_df.iterrows():
G.add_edge(edge_data['source'], edge_data['target'], weight=edge_data['weight'])
# Alternatively, build the graph directly from an edge list
G = nx.from_pandas_edgelist(pd.read_csv("./to_push_as_is/wiki-RfA.csv.gz"),
'SRC', 'TGT', ['VOT', 'RES', 'YEA', 'DAT'], create_using=nx.Graph)
Describe the network
# Basic information about the network
print(G)  # nx.info(G) was removed in networkx 3.x; print(G) gives a short summary
# List of nodes with attributes
for node, data in G.nodes(data=True):
print(f"Node {node}: {data}")
# List of edges with attributes
for u, v, data in G.edges(data=True):
    print(f"Edge ({u}, {v}): {data}")
# in-degree and out-degree (for DiGraph)
print(sorted(dict(G_.out_degree()).values()))
# Helper function for printing various graph properties
def describe_graph(G):
print(G)
if nx.is_connected(G):
print("Avg. Shortest Path Length: %.4f" %nx.average_shortest_path_length(G))
print("Diameter: %.4f" %nx.diameter(G)) # Longest shortest path
else:
print("Graph is not connected")
print("Diameter and Avg shortest path length are not defined!")
print("Sparsity: %.4f" %nx.density(G)) # #edges/#edges-complete-graph
# #closed-triplets(3*#triangles)/#all-triplets
print("Global clustering coefficient aka Transitivity: %.4f" %nx.transitivity(G))
Get attributes
# Iterate over attributes
for node, attr in G.nodes(data=True):
    pass  # use node and attr here

for u, v, attr in G.edges(data=True):
    pass  # use u, v, and attr here

# !! For a MultiDiGraph, G.edges(keys=True) returns triples (u, v, k),
# !! where k identifies the k-th parallel edge between u and v

# Filter the network based on attributes
edges_2004 = [i for i, v in nx.get_edge_attributes(G, "YEA").items() if v == 2004]
Subgraph
# Edge subgraph
edges_2004 = [i for i, v in nx.get_edge_attributes(G, "YEA").items() if v == 2004]
G_2004 = G.edge_subgraph(edges_2004)

# Node subgraph
nodes_2004 = [n for n, attr in G.nodes(data=True) if attr.get('Year') == 2004]
G_2004 = G.subgraph(nodes_2004)
Visualize the network
Network itself
import matplotlib.pyplot as plt

# Basic visualization
nx.draw(G, with_labels=True, font_weight='bold')
plt.show()

# More customized visualization
pos = nx.spring_layout(G)  # You can use other layout algorithms
nx.draw(G, pos, with_labels=True, font_weight='bold', node_size=700, node_color='skyblue',
        font_size=8, edge_color='gray', linewidths=0.5)
plt.show()
Plot the degree distribution (CDF or CCDF)
sns.ecdfplot(list(dict(G.degree()).values()), complementary=True)
plt.xscale("log")
# plt.axvline(10)
# plt.axhline(0.4)
plt.title("Complementary CDF")
plt.xlabel("Degree centrality")
Degree distribution, but the X-axis is the n-th data point
indegree = sorted(dict(G.in_degree()).values(), reverse=True)
outdegree = sorted(dict(G.out_degree()).values(), reverse=True)
indegree = np.array(indegree)
outdegree = np.array(outdegree)

hired_percentage = indegree / sum(indegree)
output_percentage = outdegree / sum(outdegree)

plt.plot(hired_percentage.cumsum(), label='Percentage of students hired by the N universities that hire most')
plt.plot(output_percentage.cumsum(), label='Percentage of students output by the N universities that output most')
plt.legend()
plt.show()
Calculate metrics
# Sparsity of the network
sparsity = nx.density(G)
print(f"Network Sparsity: {sparsity}")

# Node degree centrality
degree_centrality = nx.degree_centrality(G)
print(f"Node Degree Centrality: {degree_centrality}")

# Edge betweenness centrality
edge_betweenness = nx.edge_betweenness_centrality(G)
print(f"Edge Betweenness Centrality: {edge_betweenness}")

# Clustering coefficient
clustering_coefficient = nx.average_clustering(G)
print(f"Average Clustering Coefficient: {clustering_coefficient}")