python data-science cluster-analysis biopython

clustering of a fasta file having DNA sequences to find the most unmatched clone

I am trying to make a cluster to analyze DNA sequences and find the less matched patterns among them (for say <25% match). Is it possible to perform cluster analysis (k-means or any other approach) for the DNA in a fasta file.

for example I want to generate a cluster graph (image) for the given data (below is the fasta sequences) and find the less similar (<25%) sequence (encircled dots)

>1
 GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT
 >2
 CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC
 >3
 CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT
 >4
 CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC
 >5
 GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG
 >6
 CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC
 >7
 CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC

Solution

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Function to convert DNA sequence to one-hot encoding
def one_hot_encode(sequence):
    mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
    return [mapping[base] for base in sequence]

# DNA sequences in fasta format
sequences = {
    ">1": "GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT",
    ">2": "CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC",
    ">3": "CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT",
    ">4": "CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC",
    ">5": "GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG",
    ">6": "CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC",
    ">7": "CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
}

# Assign different colors to sequences
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink']

# One-hot encode sequences
sequences_matrix = np.array([np.concatenate(one_hot_encode(seq)) for seq in sequences.values()])

# Perform PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(sequences_matrix)

# Plot PCA results with different colors
plt.figure(figsize=(10, 6))
for i, seq_key in enumerate(sequences.keys()):
    plt.scatter(pca_result[i, 0], pca_result[i, 1], color=colors[i], label=seq_key)

plt.title('PCA of DNA Sequences')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

u can also cluster as dendrogram

import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Function to calculate Hamming distance between two DNA sequences
def hamming_distance(seq1, seq2):
    return sum(c1 != c2 for c1, c2 in zip(seq1, seq2))

# DNA sequences in fasta format
sequences = {
    ">1": "GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT",
    ">2": "CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC",
    ">3": "CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT",
    ">4": "CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC",
    ">5": "GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG",
    ">6": "CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC",
    ">7": "CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
}

# Create a matrix of pairwise Hamming distances
num_sequences = len(sequences)
distance_matrix = np.zeros((num_sequences, num_sequences))

for i, seq1_key in enumerate(sequences.keys()):
    for j, seq2_key in enumerate(sequences.keys()):
        seq1 = sequences[seq1_key]
        seq2 = sequences[seq2_key]
        distance_matrix[i, j] = hamming_distance(seq1, seq2)

# Perform hierarchical clustering
linkage_matrix = linkage(distance_matrix, method='complete')

# Plot the dendrogram
labels = list(sequences.keys())
plt.figure(figsize=(10, 6))
dendrogram(linkage_matrix, labels=labels, orientation='right')
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Hamming Distance')
plt.show()

try this

from Bio import SeqIO
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np

# Example DNA sequences in a FASTA file
example_fasta = ">1\nGATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT\n" \
                ">2\nCAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC\n" \
                ">3\nCGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT\n" \
                ">4\nCGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC\n" \
                ">5\nGATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG\n" \
                ">6\nCAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC\n" \
                ">7\nCAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"

# Save example data to a FASTA file
fasta_file = "example_fasta.fasta"
with open(fasta_file, "w") as f:
    f.write(example_fasta)

# Read DNA sequences from the FASTA file
sequences = list(SeqIO.parse(fasta_file, "fasta"))
dna_sequences = [str(seq.seq).upper() for seq in sequences]

# Define a function to convert DNA sequences to numerical representation
def dna_to_num(seq):
    return np.array([ord(base) for base in seq])

# Convert DNA sequences to numerical data
dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
pca_data = pca.fit_transform(dna_data)

unique_labels = set(kmeans.labels_)
print(f"Number of clusters: {len(unique_labels)}")

# Apply KMeans clustering
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(pca_data)

# Plot the clustered data
# Plot the clustered data
import matplotlib.pyplot as plt
from itertools import cycle

# Dynamically generate colors based on the number of clusters
colors = cycle(['r', 'g', 'b', 'c', 'm'])

for i, label in enumerate(set(kmeans.labels_)):
    cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=next(colors), label=f'Cluster {label}')

plt.legend()
plt.show()
# Plot the clustered data with sequence annotations
import matplotlib.pyplot as plt
from itertools import cycle

# Dynamically generate colors based on the number of clusters
colors = cycle(['r', 'g', 'b', 'c', 'm'])

for i, label in enumerate(set(kmeans.labels_)):
    cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=next(colors), label=f'Cluster {label}')

    # Annotate points with sequence identifiers
    for j, seq_label in enumerate(sequences):
        if kmeans.labels_[j] == label:
            plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')

plt.legend()
plt.show()

U can also label based on threshold

from Bio import SeqIO
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

# Example DNA sequences in a FASTA file
example_fasta = ">1\nGATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT\n" \
                ">2\nCAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC\n" \
                ">3\nCGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT\n" \
                ">4\nCGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC\n" \
                ">5\nGATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG\n" \
                ">6\nCAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC\n" \
                ">7\nCAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"

# Save example data to a FASTA file
fasta_file = "example_fasta.fasta"
with open(fasta_file, "w") as f:
    f.write(example_fasta)

# Read DNA sequences from the FASTA file
sequences = list(SeqIO.parse(fasta_file, "fasta"))
dna_sequences = [str(seq.seq).upper() for seq in sequences]

# Map DNA bases to numerical values
base_mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3} #I shifted here befor def dna_to_num, use any one def, and avoid repetation

def dna_to_num(seq):
    return np.array([base_mapping.get(base, -1) for base in seq if base in base_mapping])

dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])




def dna_to_num(seq):
    return np.array([base_mapping[base] for base in seq])

dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])

# Perform PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(dna_data)

# Perform KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(pca_data)

# ... (previous code)

# Set the threshold for labeling
labeling_threshold = 0.75


colors = ['r', 'g', 'b', 'c', 'm']
for i, label in enumerate(set(kmeans.labels_)):
    cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=colors[i], label=f'Cluster {label}')

    # Annotate points with sequence identifiers if similarity is below the threshold
    for j, seq_label in enumerate(sequences):
        if kmeans.labels_[j] == label:
            similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
            if similarity_to_centroid < labeling_threshold:
                plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')

plt.legend()
plt.show()

or if u want to display only certain based on threshold

    # Map DNA bases to numerical values
base_mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Read DNA sequences from the FASTA file
sequences = list(SeqIO.parse(fasta_file, "fasta"))
dna_sequences = [str(seq.seq).upper() for seq in sequences]

def dna_to_num(seq):
    return np.array([base_mapping.get(base, -1) for base in seq])

dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])

# Perform PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(dna_data)

# Perform KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(pca_data)

# Set the threshold for labeling
labeling_threshold = 0.75

colors = ['r', 'g', 'b', 'c', 'm']
for i, label in enumerate(set(kmeans.labels_)):
    # Annotate points with sequence identifiers if similarity is below the threshold
    for j, seq_label in enumerate(sequences):
        if kmeans.labels_[j] == label:
            similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
            if similarity_to_centroid < labeling_threshold:
                # Show point only for points below the threshold
                plt.scatter(pca_data[j, 0], pca_data[j, 1], color=colors[i])
                plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')

plt.axis('equal')  # Make sure the plot is in equal scale
plt.show()

if u want to encircle threshold

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# ... (previous code)

# Set the threshold for labeling
labeling_threshold = 0.75

colors = ['r', 'g', 'b', 'c', 'm']

for i, label in enumerate(set(kmeans.labels_)):
    cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])

    # Annotate points with sequence identifiers if similarity is below the threshold
    in_threshold_points = []
    for j, seq_label in enumerate(sequences):
        if kmeans.labels_[j] == label:
            similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
            if similarity_to_centroid < labeling_threshold:
                in_threshold_points.append((pca_data[j, 0], pca_data[j, 1]))
                plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5, 5), ha='right')

    # Draw circles around the points within the threshold
    if len(in_threshold_points) > 0:
        for point in in_threshold_points:
            circle = plt.Circle((point[0], point[1]), labeling_threshold, color='black', fill=False)
            plt.gca().add_patch(circle)

    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=colors[i], label=f'Cluster {label}')

plt.legend()
plt.show()