Search code examples
pythondata-sciencecluster-analysisbiopython

clustering of a fasta file having DNA sequences to find the most unmatched clone


I am trying to make a cluster to analyze DNA sequences and find the less matched patterns among them (for say <25% match). Is it possible to perform cluster analysis (k-means or any other approach) for the DNA in a fasta file.

for example I want to generate a cluster graph (image) for the given data (below is the fasta sequences) and find the less similar (<25%) sequence (encircled dots)

>1
 GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT
 >2
 CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC
 >3
 CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT
 >4
 CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC
 >5
 GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG
 >6
 CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC
 >7
 CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC 

enter image description here


Solution

  • from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt
    
    # Function to convert DNA sequence to one-hot encoding
    def one_hot_encode(sequence):
        mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
        return [mapping[base] for base in sequence]
    
    # DNA sequences in fasta format
    sequences = {
        ">1": "GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT",
        ">2": "CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC",
        ">3": "CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT",
        ">4": "CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC",
        ">5": "GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG",
        ">6": "CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC",
        ">7": "CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
    }
    
    # Assign different colors to sequences
    colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink']
    
    # One-hot encode sequences
    sequences_matrix = np.array([np.concatenate(one_hot_encode(seq)) for seq in sequences.values()])
    
    # Perform PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(sequences_matrix)
    
    # Plot PCA results with different colors
    plt.figure(figsize=(10, 6))
    for i, seq_key in enumerate(sequences.keys()):
        plt.scatter(pca_result[i, 0], pca_result[i, 1], color=colors[i], label=seq_key)
    
    plt.title('PCA of DNA Sequences')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend()
    plt.show()
    

    enter image description here

    u can also cluster as dendrogram

    import numpy as np
    from scipy.cluster.hierarchy import dendrogram, linkage
    import matplotlib.pyplot as plt
    
    # Function to calculate Hamming distance between two DNA sequences
    def hamming_distance(seq1, seq2):
        return sum(c1 != c2 for c1, c2 in zip(seq1, seq2))
    
    # DNA sequences in fasta format
    sequences = {
        ">1": "GATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT",
        ">2": "CAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC",
        ">3": "CGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT",
        ">4": "CGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC",
        ">5": "GATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG",
        ">6": "CAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC",
        ">7": "CAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
    }
    
    # Create a matrix of pairwise Hamming distances
    num_sequences = len(sequences)
    distance_matrix = np.zeros((num_sequences, num_sequences))
    
    for i, seq1_key in enumerate(sequences.keys()):
        for j, seq2_key in enumerate(sequences.keys()):
            seq1 = sequences[seq1_key]
            seq2 = sequences[seq2_key]
            distance_matrix[i, j] = hamming_distance(seq1, seq2)
    
    # Perform hierarchical clustering
    linkage_matrix = linkage(distance_matrix, method='complete')
    
    # Plot the dendrogram
    labels = list(sequences.keys())
    plt.figure(figsize=(10, 6))
    dendrogram(linkage_matrix, labels=labels, orientation='right')
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Hamming Distance')
    plt.show()
    

    enter image description here

    try this

    from Bio import SeqIO
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    import numpy as np
    
    # Example DNA sequences in a FASTA file
    example_fasta = ">1\nGATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT\n" \
                    ">2\nCAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC\n" \
                    ">3\nCGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT\n" \
                    ">4\nCGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC\n" \
                    ">5\nGATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG\n" \
                    ">6\nCAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC\n" \
                    ">7\nCAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
    
    # Save example data to a FASTA file
    fasta_file = "example_fasta.fasta"
    with open(fasta_file, "w") as f:
        f.write(example_fasta)
    
    # Read DNA sequences from the FASTA file
    sequences = list(SeqIO.parse(fasta_file, "fasta"))
    dna_sequences = [str(seq.seq).upper() for seq in sequences]
    
    # Define a function to convert DNA sequences to numerical representation
    def dna_to_num(seq):
        return np.array([ord(base) for base in seq])
    
    # Convert DNA sequences to numerical data
    dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
    
    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(dna_data)
    
    unique_labels = set(kmeans.labels_)
    print(f"Number of clusters: {len(unique_labels)}")
    
    # Apply KMeans clustering
    kmeans = KMeans(n_clusters=4, random_state=42)
    kmeans.fit(pca_data)
    
    # Plot the clustered data
    # Plot the clustered data
    import matplotlib.pyplot as plt
    from itertools import cycle
    
    # Dynamically generate colors based on the number of clusters
    colors = cycle(['r', 'g', 'b', 'c', 'm'])
    
    for i, label in enumerate(set(kmeans.labels_)):
        cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=next(colors), label=f'Cluster {label}')
    
    plt.legend()
    plt.show()
    # Plot the clustered data with sequence annotations
    import matplotlib.pyplot as plt
    from itertools import cycle
    
    # Dynamically generate colors based on the number of clusters
    colors = cycle(['r', 'g', 'b', 'c', 'm'])
    
    for i, label in enumerate(set(kmeans.labels_)):
        cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=next(colors), label=f'Cluster {label}')
    
        # Annotate points with sequence identifiers
        for j, seq_label in enumerate(sequences):
            if kmeans.labels_[j] == label:
                plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')
    
    plt.legend()
    plt.show()
    

    enter image description here

    U can also label based on threshold

    from Bio import SeqIO
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    import numpy as np
    import matplotlib.pyplot as plt
    
    # Example DNA sequences in a FASTA file
    example_fasta = ">1\nGATGTACTTCGTTCAGTTACATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCCTT\n" \
                    ">2\nCAGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAATTATTATTCGCAATTCCTTTAGTTTGTTC\n" \
                    ">3\nCGGTATTACTTCGTTCAGTTACGTATTATGCTCGAAAGGAATTTCTATTGAAAGGTATTGCAATTCCTTTAGTTGTTCCTTTCTAT\n" \
                    ">4\nCGGTGTACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTC\n" \
                    ">5\nGATGTACTTCGTTCCAGTTGTGTGTGCTGCTCAAGGAGATTTTTCAACGTGAAAAAATTATTATTCGCAATTCAACTTTGAATTTG\n" \
                    ">6\nCAATGTACTTCGTTCGGTTACGTATTGCTGCTCAAGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC\n" \
                    ">7\nCAAACACTTCGTTCAGTTACGTATTGCTGCTCAAGGAGATTTTCAACGTGAAAAAAATTATTATTCGCAATTCCTTTAGTTGTTCC"
    
    # Save example data to a FASTA file
    fasta_file = "example_fasta.fasta"
    with open(fasta_file, "w") as f:
        f.write(example_fasta)
    
    # Read DNA sequences from the FASTA file
    sequences = list(SeqIO.parse(fasta_file, "fasta"))
    dna_sequences = [str(seq.seq).upper() for seq in sequences]
    
    # Map DNA bases to numerical values
    base_mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3} #I shifted here befor def dna_to_num, use any one def, and avoid repetation
    
    def dna_to_num(seq):
        return np.array([base_mapping.get(base, -1) for base in seq if base in base_mapping])
    
    dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
    
    
    
    
    def dna_to_num(seq):
        return np.array([base_mapping[base] for base in seq])
    
    dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
    
    # Perform PCA
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(dna_data)
    
    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=5, random_state=42)
    kmeans.fit(pca_data)
    
    # ... (previous code)
    
    # Set the threshold for labeling
    labeling_threshold = 0.75
    
    
    colors = ['r', 'g', 'b', 'c', 'm']
    for i, label in enumerate(set(kmeans.labels_)):
        cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=colors[i], label=f'Cluster {label}')
    
        # Annotate points with sequence identifiers if similarity is below the threshold
        for j, seq_label in enumerate(sequences):
            if kmeans.labels_[j] == label:
                similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
                if similarity_to_centroid < labeling_threshold:
                    plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')
    
    plt.legend()
    plt.show()
    

    enter image description here

    or if u want to display only certain based on threshold

        # Map DNA bases to numerical values
    base_mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    
    # Read DNA sequences from the FASTA file
    sequences = list(SeqIO.parse(fasta_file, "fasta"))
    dna_sequences = [str(seq.seq).upper() for seq in sequences]
    
    def dna_to_num(seq):
        return np.array([base_mapping.get(base, -1) for base in seq])
    
    dna_data = np.array([dna_to_num(seq) for seq in dna_sequences])
    
    # Perform PCA
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(dna_data)
    
    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=5, random_state=42)
    kmeans.fit(pca_data)
    
    # Set the threshold for labeling
    labeling_threshold = 0.75
    
    colors = ['r', 'g', 'b', 'c', 'm']
    for i, label in enumerate(set(kmeans.labels_)):
        # Annotate points with sequence identifiers if similarity is below the threshold
        for j, seq_label in enumerate(sequences):
            if kmeans.labels_[j] == label:
                similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
                if similarity_to_centroid < labeling_threshold:
                    # Show point only for points below the threshold
                    plt.scatter(pca_data[j, 0], pca_data[j, 1], color=colors[i])
                    plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5,5), ha='right')
    
    plt.axis('equal')  # Make sure the plot is in equal scale
    plt.show()
    

    enter image description here

    if u want to encircle threshold

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.cluster import KMeans
    
    # ... (previous code)
    
    # Set the threshold for labeling
    labeling_threshold = 0.75
    
    colors = ['r', 'g', 'b', 'c', 'm']
    
    for i, label in enumerate(set(kmeans.labels_)):
        cluster_points = np.array([pca_data[j] for j, l in enumerate(kmeans.labels_) if l == label])
    
        # Annotate points with sequence identifiers if similarity is below the threshold
        in_threshold_points = []
        for j, seq_label in enumerate(sequences):
            if kmeans.labels_[j] == label:
                similarity_to_centroid = np.linalg.norm(pca_data[j] - kmeans.cluster_centers_[label])
                if similarity_to_centroid < labeling_threshold:
                    in_threshold_points.append((pca_data[j, 0], pca_data[j, 1]))
                    plt.annotate(seq_label.id, (pca_data[j, 0], pca_data[j, 1]), textcoords="offset points", xytext=(5, 5), ha='right')
    
        # Draw circles around the points within the threshold
        if len(in_threshold_points) > 0:
            for point in in_threshold_points:
                circle = plt.Circle((point[0], point[1]), labeling_threshold, color='black', fill=False)
                plt.gca().add_patch(circle)
    
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1], color=colors[i], label=f'Cluster {label}')
    
    plt.legend()
    plt.show()
    

    enter image description here