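Import the `pandas` python module and name it `pd`.
Load the 'Data' sheet of data.xlsx into `data`.
Print the number of rows and columns in `data`.
Display the first 10 rows of the `data` table.
Load the 'Peak' sheet of data.xlsx into `peak`.
Print the number of rows and columns in `peak`.
Display the first 10 rows of the `peak` table.
import pandas as pd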
# Read the 'Data' worksheet of data.xlsx, report its dimensions, then preview it.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in -- confirm this runs inside a notebook.
data = pd.read_excel('data.xlsx', sheet_name='Data')
print("Data Table: {} rows & {} columns".format(data.shape[0], data.shape[1]))
display(data.head(n=10))  # first 10 rows of the data table

# Read the 'Peak' worksheet of the same workbook, report its dimensions, then preview it.
peak = pd.read_excel('data.xlsx', sheet_name='Peak')
print("Peak Table: {} rows & {} columns".format(peak.shape[0], peak.shape[1]))
display(peak.head(n=10))  # first 10 rows of the peak table
Import the `matplotlib.pyplot` visualisation package and name it `plt`.
Use `%matplotlib inline` to help simplify plotting.
Plot a histogram of `peak.RSD`.
Display the plot with `plt.show()`.
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(peak.RSD, 50, density=True, facecolor='g', alpha=0.5)
plt.xlabel('RSD', fontsize=15)
plt.show()
Import the `seaborn` visualisation package and name it `sns`.
Plot the `peak` columns `peak.RSD` vs. `peak.D_Ratio` with bivariate and distribution graphs. (automatically displayed)
import seaborn as sns
# Joint plot of RSD (x axis) against D_Ratio (y axis) from the peak table,
# using kernel-density estimates (kind='kde') in sky blue.
sns.jointplot(
    x=peak['RSD'],
    y=peak['D_Ratio'],
    kind='kde',
    color="skyblue",
)
Import `numpy` (import as `np`).
Import the `scikit-learn` packages `sklearn.decomposition.PCA`, `sklearn.preprocessing.StandardScaler`.
Import `matplotlib.colors.ListedColormap`.
# Packages needed for the PCA quality-control plot
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import ListedColormap

# Build the X matrix: select the data columns listed in peak['Name'],
# take the natural log, then rescale with StandardScaler.
names = peak['Name']
x = StandardScaler().fit_transform(np.log(data[names].values))

# Fit a two-component PCA on x and keep the projected scores.
pca = PCA(n_components=2)
scores = pca.fit_transform(x)

# Separate the score rows by the value in the SampleType column of data.
label = data['SampleType']
QC_scores = scores[label == 'QC', :]
Sample_scores = scores[label == 'Sample', :]

# Draw both groups on one 8x8 figure: samples in green, QCs in red.
fig = plt.figure(figsize=(8, 8))
h1 = plt.scatter(
    Sample_scores[:, 0],
    Sample_scores[:, 1],
    s=100,
    alpha=0.5,
    facecolors='Green',
    edgecolors='Black',
)
h2 = plt.scatter(
    QC_scores[:, 0],
    QC_scores[:, 1],
    s=100,
    alpha=0.5,
    facecolors='Red',
    edgecolors='Black',
)

# Annotate the figure: legend, axis labels, and title.
plt.legend([h1, h2], ['Sample', 'QC'], fontsize=15)
plt.xlabel(xlabel='PC1', fontsize=15)
plt.ylabel(ylabel='PC2', fontsize=15)
plt.title(label='Quality Control PCA plot', fontsize=20)

# Render the finished figure.
plt.show()
Plot the `peak` columns: `peak.Mol_Weight` vs. `peak.RT_minutes` (each dot is a metabolite peak).
Set the marker size with `s=peak.RSD**2/2`.
Set the transparency with `alpha=0.2`.
# Bubble plot of the peak table: Mol_Weight on x, RT_minutes on y, marker
# area RSD^2/2, red fill with black edges at 20% opacity.
fig = plt.figure(figsize=(20, 16))
plt.scatter(
    peak['Mol_Weight'],
    peak['RT_minutes'],
    s=peak['RSD'] ** 2 / 2,
    c='red',
    edgecolors='black',
    alpha=0.2,
)
plt.title(label='Metabolites Detected (sized by RSD)', fontsize=20)
plt.xlabel(xlabel='Molecular Weight', fontsize=15)
plt.ylabel(ylabel='RT minutes', fontsize=15)
plt.show()