I am trying to make a cluster to analyze DNA sequences and find the less matched patterns among them (for say <25% match). Is it possible to perform cluster analysis (k-means or any other approach) for the DNA in a fasta file.
for example I want to generate a cluster graph (image) for the given data (below is the fasta sequences) and find the less similar (<25%) sequence (encircled dots)
>1
GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT
>2
CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC
>3
CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT
>4
CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC
>5
GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG
>6
CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC
>7
CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Function to convert DNA sequence to one-hot encoding
def one_hot_encode(sequence):
mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
return [mapping[base] for base in sequence]
# DNA sequences in fasta format
sequences = {
">1": "GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT",
">2": "CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC",
">3": "CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT",
">4": "CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC",
">5": "GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG",
">6": "CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC",
">7": "CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
}
# Assign different colors to sequences
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink']
# One-hot encode sequences
sequences_matrix = np.array([np.concatenate(one_hot_encode(seq)) for seq in sequences.values()])
# Perform PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(sequences_matrix)
# Plot PCA results with different colors
plt.figure(figsize=(10, 6))
for i, seq_key in enumerate(sequences.keys()):
plt.scatter(pca_result[i, 0], pca_result[i, 1], color=colors[i], label=seq_key)
plt.title('PCA of DNA Sequences')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()
u can also cluster as dendrogram
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
# Function to calculate Hamming distance between two DNA sequences
def hamming_distance(seq1, seq2):
return sum(c1 != c2 for c1, c2 in zip(seq1, seq2))
# DNA sequences in fasta format
sequences = {
">1": "GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT",
">2": "CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC",
">3": "CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT",
">4": "CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC",
">5": "GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG",
">6": "CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC",
">7": "CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
}
# Create a matrix of pairwise Hamming distances
num_sequences = len(sequences)
distance_matrix = np.zeros((num_sequences, num_sequences))
for i, seq1_key in enumerate(sequences.keys()):
for j, seq2_key in enumerate(sequences.keys()):
seq1 = sequences[seq1_key]
seq2 = sequences[seq2_key]
distance_matrix[i, j] = hamming_distance(seq1, seq2)
# Perform hierarchical clustering
linkage_matrix = linkage(distance_matrix, method='complete')
# Plot the dendrogram
labels = list(sequences.keys())
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix, labels=labels, orientation='right')
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Hamming Distance')
plt.show()
try this
from Bio import SeqIO
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
# Example DNA sequences in a FASTA file
example_fasta = ">1\nGATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT\n" \
">2\nCAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC\n" \
">3\nCGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT\n" \
">4\nCGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC\n" \
">5\nGATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG\n" \
">6\nCAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC\n" \
">7\nCAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
# Save example data to a FASTA file
fasta_file = "example_fasta.fasta"
with open(fasta_file, "w") as f:
f.write(example_fasta)
# Read DNA sequences from the FASTA file
sequences = list(SeqIO.parse(fasta_file, "fasta"))
dna_sequences = [str(seq.seq).upper() for seq in sequences]
# Define a function to convert DNA sequences to numerical representation
def dna_to_num(seq):
return np.array([ord(base) for base in seq])
# Convert DNA sequences to numerical data
dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
pca_data = pca.fit_transform(dna_data)
unique_labels = set(kmeans.labels_)
print(f"Number of clusters: {len(unique_labels)}")
# Apply KMeans clustering
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(pca_data)
# Plot the clustered data
# Plot the clustered data
import matplotlib.pyplot as plt
from itertools import cycle
# Dynamically generate colors based on the number of clusters
colors = cycle(['r', 'g', 'b', 'c', 'm'])
for i, label in enumerate(set(kmeans.labels_)):
cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=next(colors), label=f'Cluster {label}')
plt.legend()
plt.show()
# Plot the clustered data with sequence annotations
import matplotlib.pyplot as plt
from itertools import cycle
# Dynamically generate colors based on the number of clusters
colors = cycle(['r', 'g', 'b', 'c', 'm'])
for i, label in enumerate(set(kmeans.labels_)):
cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=next(colors), label=f'Cluster {label}')
# Annotate points with sequence identifiers
for j, seq_label in enumerate(sequences):
if kmeans.labels_[j] == label:
plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')
plt.legend()
plt.show()
U can also label based on threshold
from Bio import SeqIO
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
# Example DNA sequences in a FASTA file
example_fasta = ">1\nGATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT\n" \
">2\nCAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC\n" \
">3\nCGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT\n" \
">4\nCGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC\n" \
">5\nGATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG\n" \
">6\nCAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC\n" \
">7\nCAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
# Save example data to a FASTA file
fasta_file = "example_fasta.fasta"
with open(fasta_file, "w") as f:
f.write(example_fasta)
# Read DNA sequences from the FASTA file
sequences = list(SeqIO.parse(fasta_file, "fasta"))
dna_sequences = [str(seq.seq).upper() for seq in sequences]
# Map DNA bases to numerical values
base_mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3} #I shifted here befor def dna_to_num, use any one def, and avoid repetation
def dna_to_num(seq):
return np.array([base_mapping.get(base, -1) for base in seq if base in base_mapping])
dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
def dna_to_num(seq):
return np.array([base_mapping[base] for base in seq])
dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
# Perform PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(dna_data)
# Perform KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(pca_data)
# ... (previous code)
# Set the threshold for labeling
labeling_threshold = 0.75
colors = ['r', 'g', 'b', 'c', 'm']
for i, label in enumerate(set(kmeans.labels_)):
cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=colors[i], label=f'Cluster {label}')
# Annotate points with sequence identifiers if similarity is below the threshold
for j, seq_label in enumerate(sequences):
if kmeans.labels_[j] == label:
similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
if similarity_to_centroid < labeling_threshold:
plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')
plt.legend()
plt.show()
or if u want to display only certain based on threshold
# Map DNA bases to numerical values
base_mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
# Read DNA sequences from the FASTA file
sequences = list(SeqIO.parse(fasta_file, "fasta"))
dna_sequences = [str(seq.seq).upper() for seq in sequences]
def dna_to_num(seq):
return np.array([base_mapping.get(base, -1) for base in seq])
dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
# Perform PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(dna_data)
# Perform KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(pca_data)
# Set the threshold for labeling
labeling_threshold = 0.75
colors = ['r', 'g', 'b', 'c', 'm']
for i, label in enumerate(set(kmeans.labels_)):
# Annotate points with sequence identifiers if similarity is below the threshold
for j, seq_label in enumerate(sequences):
if kmeans.labels_[j] == label:
similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
if similarity_to_centroid < labeling_threshold:
# Show point only for points below the threshold
plt.scatter(pca_data[j, 0], pca_data[j, 1], color=colors[i])
plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')
plt.axis('equal') # Make sure the plot is in equal scale
plt.show()
if u want to encircle threshold
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# ... (previous code)
# Set the threshold for labeling
labeling_threshold = 0.75
colors = ['r', 'g', 'b', 'c', 'm']
for i, label in enumerate(set(kmeans.labels_)):
cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
# Annotate points with sequence identifiers if similarity is below the threshold
in_threshold_points = []
for j, seq_label in enumerate(sequences):
if kmeans.labels_[j] == label:
similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
if similarity_to_centroid < labeling_threshold:
in_threshold_points.append((pca_data[j, 0], pca_data[j, 1]))
plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5, 5), ha='right')
# Draw circles around the points within the threshold
if len(in_threshold_points) > 0:
for point in in_threshold_points:
circle = plt.Circle((point[0], point[1]), labeling_threshold, color='black', fill=False)
plt.gca().add_patch(circle)
plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=colors[i], label=f'Cluster {label}')
plt.legend()
plt.show()