Search code examples
json python-3.x

Convert a tab-delimited file to JSON in Python, adding a supplied argument and additional keys


I'm looking to convert a two-column, tab-delimited file into 'key: value' pairs and dump them as JSON, together with a set of additional keys and the argument sample_id = 'WGNP1000001'. Below are my input, the expected output format, and my code. I'd appreciate any help. Thanks.

Tab delimited input file: WGNP1000001.list.txt

insert_size 447.3
insert_size_std 98.2
pct_properly_paired 97.9
pct_mapped  99.63

Expected output JSON format:

{
    "sample": {
        "id": "WGNP1000001"
    },
    "wgs_metrics": {
        "insert_size_std": 98.2,
        "insert_size": 447.3,
        "pct_mapped": 99.63,
        "pct_properly_paired": 97.9
    }
}
sample_id = 'WGNP1000001'

count_aln.py --input_metrics "${sample_id}.list.txt" --sample_id ${sample_id} --output_json ${sample_id}.metrics.json

Code:

#!/usr/bin/env python3

import argparse
import json
import subprocess
import numpy as np
import sys
import os
from pathlib import Path


def parse_args():
    """Parse the command-line options and make sure the scratch dir exists.

    Returns:
        argparse.Namespace: with ``sample_id``, ``input_metrics``,
        ``output_json`` and ``scratch_dir`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_id", dest="sample_id", required=True,
                        default=None,
                        help="Sample ID")
    parser.add_argument("--input_metrics", dest="input_metrics", required=True,
                        default=None,
                        help="Path to input aln metrics list")
    parser.add_argument("--output_json", dest="output_json", required=False,
                        default="./variant_counts.json",
                        help="Path to output file for variant metrics. Default: ./variant_counts.json")
    parser.add_argument("--scratch_dir", dest="scratch_dir", required=False,
                        default="./",
                        help="Path to scratch dir. Default: ./")
    args = parser.parse_args()

    # create scratch dir if it doesn't exist
    Path(args.scratch_dir).mkdir(parents=True, exist_ok=True)

    return args


def raw_data(input_metrics):
    """Read a two-column, tab-delimited metrics file into a dict.

    This function must live at module level: when it was nested inside
    ``parse_args`` after the ``return`` statement it was never defined, so
    the script failed with ``NameError`` as soon as it was called.

    Args:
        input_metrics: Path to the tab-delimited file (``key<TAB>value``).

    Returns:
        dict: first column mapped to the second column. A value that reads
        as an integer becomes ``int``, any other numeric value becomes
        ``float``, and everything else is kept as a whitespace-stripped
        string. Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line does not contain a tab separator.
    """
    metrics = {}
    with open(input_metrics) as handle:
        for line_number, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                raise ValueError(
                    f"{input_metrics}: line {line_number} is not tab-delimited: {line!r}"
                )
            text = fields[1].strip()
            # Try int before float so "1996315" stays 1996315, not 1996315.0.
            for convert in (int, float):
                try:
                    value = convert(text)
                except ValueError:
                    continue
                break
            else:
                value = text
            metrics[fields[0]] = value
    return metrics

def save_output(data_metrics, outfile, sample_id=None):
    """Write the metrics, wrapped with the sample id, to a JSON file.

    Args:
        data_metrics: Mapping of metric name to value; stored under the
            ``"wgs_metrics"`` key.
        outfile: Path of the JSON file to create or overwrite.
        sample_id: Sample identifier stored under ``"sample" -> "id"``.
            When omitted, the module-level ``args.sample_id`` is used, which
            keeps existing two-argument calls working.

    The output is written with sorted keys, a 4-space indent and a trailing
    newline.
    """
    if sample_id is None:
        # Backward-compatible fallback to the namespace set in __main__.
        sample_id = args.sample_id
    # Build the payload before opening the file so a failure here does not
    # leave behind an empty, truncated output file.
    payload = {"sample" : {"id" : sample_id}, "wgs_metrics" : data_metrics}
    with open(outfile, "w") as f:
        json.dump(payload, f, sort_keys=True, indent=4)
        f.write("\n")

if __name__ == "__main__":
    # Script entry point: parse the CLI options, load the metrics table and
    # write it out as JSON.
    # ``args`` has to remain a module-level name because save_output reads
    # args.sample_id from it.
    args = parse_args()
    save_output(raw_data(args.input_metrics), args.output_json)

Output:

{
    "sample": {
        "id": "WGNP1000001"
    },
    "wgs_metrics": {
        "insert_size": "447.3\n",
        "insert_size_std": "98.2\n",
        "pct_mapped": "99.63\n",
        "pct_properly_paired": "97.9\n"
    }
}

Input containing both float and integer values

insert_size 447.3
insert_size_std 98.2
pct_properly_paired 97.9
pct_mapped  99.63
yield_bp_q30    1996315
mean_autosome_coverage  0.000644
pct_autosomes_15x   0.000016
mad_autosome_coverage   0

Output of the new code

{
    "sample": {
        "id": "WGNP1000001"
    },
    "wgs_metrics": {
        "insert_size": 447.3,
        "insert_size_std": 98.2,
        "mad_autosome_coverage": 0.0,
        "mean_autosome_coverage": 0.000644,
        "pct_autosomes_15x": 1.6e-05,
        "pct_mapped": 99.63,
        "pct_properly_paired": 97.9,
        "yield_bp_q30": 1996315.0
    }
}

Expected output, without integers being converted to floats (yield_bp_q30, mad_autosome_coverage) and without small values being written in scientific notation (pct_autosomes_15x):

{
    "sample": {
        "id": "WGNP1000001"
    },
    "wgs_metrics": {
        "insert_size": 447.3,
        "insert_size_std": 98.2,
        "mad_autosome_coverage": 0,
        "mean_autosome_coverage": 0.000644,
        "pct_autosomes_15x": 0.000016,
        "pct_mapped": 99.63,
        "pct_properly_paired": 97.9,
        "yield_bp_q30": 1996315
    }
}

Try1:

def raw_data(input_metrics):
    """Load a tab-delimited metrics file as a dict of raw strings.

    Blank (whitespace-only) lines are skipped. For every other line the
    first tab-separated field is the key and the second field, with newline
    characters removed, is the value. No numeric conversion is attempted, so
    every value stays a string.
    """
    table = {}
    with open(input_metrics) as handle:
        populated = (line for line in handle if line.strip())
        for fields in (line.split('\t') for line in populated):
            table[fields[0]] = fields[1].replace("\n", "")
    return table

Try1 Output

{
    "sample": {
        "id": "NA12878"
    },
    "wgs_metrics": {
        "insert_size": "447.3",
        "insert_size_std": "98.2",
        "mad_autosome_coverage": "0",
        "mean_autosome_coverage": "0.000644",
        "pct_autosomes_15x": "0.000016",
        "pct_mapped": "99.63",
        "pct_properly_paired": "97.9",
        "yield_bp_q30": "1996315"
    }
}

Solution

  • You need to convert the second item of each row from a string to a number.

    def raw_data(input_metrics):
        """Read a two-column, tab-delimited metrics file into a dict.

        Args:
            input_metrics: Path to the tab-delimited file (``key<TAB>value``).

        Returns:
            dict: first column mapped to the second column. Integer-looking
            values become ``int`` (so ``1996315`` is not turned into
            ``1996315.0``), other numeric values become ``float``, and
            anything else is kept as a stripped string. Blank lines are
            skipped.

        Note:
            ``json.dump`` writes floats using their ``repr``, so a very small
            value such as ``0.000016`` is still emitted as ``1.6e-05``; that
            is valid JSON and parses back to the same number.
        """
        d = {}
        with open(input_metrics) as f:
            for line in f:
                if not line.strip():
                    continue
                row = line.split('\t')
                key = row[0]
                value_str = row[1].strip()
                # Try int before float so whole numbers keep their type.
                for convert in (int, float):
                    try:
                        value = convert(value_str)
                    except ValueError:
                        continue
                    break
                else:
                    value = value_str
                d[key] = value
        return d