Search code examples
c#jsoncsvchoetl

How to aggregate confidence levels in csv?


My program currently loops through a directory of PDF/image files and generates JSON files using the Azure Computer Vision REST API. Using the JsonToCsv() function below, I export specific JSON elements from those files into a CSV file, so that the output looks like this:

file.csv output:

page,text,words,confidence
1,The quick brown fox jumps,The,0.958
1,The quick brown fox jumps,quick,0.57
1,The quick brown fox jumps,brown,0.799
1,The quick brown fox jumps,fox,0.442
1,The quick brown fox jumps,jumps,0.878
1,over,over,0.37
1,the lazy dog!,the,0.909
1,the lazy dog!,lazy,0.853
1,the lazy dog!,dog!,0.41

What I'd like to do is consolidate the words so that they are comma-delimited on a single row instead of spread over separate rows, and then average the confidence levels across all the words of each text. For example, the new file would look like:

page,text,words,confidence
1,The quick brown fox jumps,The,quick,brown,fox,jumps,0.729
1,over,over,0.37
1,the lazy dog!,the,lazy,dog!,0.724

Here, 0.729 for the first text is the average of the relevant confidence levels, e.g. (0.958+0.57+0.799+0.442+0.878)/5 = 0.7294, shown to three decimal places. The same operation is performed on the last text as well.

How do I update the function below to accomplish this?

JsonToCsv() code:

/// <summary>
/// Flattens the <c>readResults</c> nodes of a JSON file into a CSV file,
/// emitting one row per recognized word.
/// </summary>
/// <param name="jsonFile">Path of the JSON file to read.</param>
/// <param name="csvfFile">Path of the CSV file to write.</param>
private static void JsonToCsv(string jsonFile, string csvfFile) {
    using (var reader = new ChoJSONReader(jsonFile).WithJSONPath("$..readResults"))
    using (var writer = new ChoCSVWriter(csvfFile).WithFirstLineHeader())
    {
        // page -> lines -> words, projected to one flat record per word.
        // The query is lazy, so it is enumerated by Write() while the reader is still open.
        var wordRows = reader.SelectMany(page =>
            ((dynamic[])page.lines).SelectMany(line =>
                ((dynamic[])line.words).Select(word => new
                {
                    page.page,
                    line.text,
                    words = word.text,
                    word.confidence
                })));

        writer.Write(wordRows);
    }
}

sample JSON file:

{
  "status": "succeeded",
  "createdDateTime": "2020-05-28T05:13:21Z",
  "lastUpdatedDateTime": "2020-05-28T05:13:22Z",
  "analyzeResult": {
    "version": "3.1.0",
    "readResults": [
      {
        "page": 1,
        "language": "en",
        "angle": 0.8551,
        "width": 2661,
        "height": 1901,
        "unit": "pixel",
        "lines": [
          {
            "boundingBox": [
              67,
              646,
              2582,
              713,
              2580,
              876,
              67,
              821
            ],
            "text": "The quick brown fox jumps",
            "words": [
              {
                "boundingBox": [
                  143,
                  650,
                  435,
                  661,
                  436,
                  823,
                  144,
                  824
                ],
                "text": "The",
                "confidence": 0.958
              },
              {
                "boundingBox": [
                  540,
                  665,
                  926,
                  679,
                  926,
                  825,
                  541,
                  823
                ],
                "text": "quick",
                "confidence": 0.57
              },
              {
                "boundingBox": [
                  1125,
                  686,
                  1569,
                  700,
                  1569,
                  838,
                  1125,
                  828
                ],
                "text": "brown",
                "confidence": 0.799
              },
              {
                "boundingBox": [
                  1674,
                  703,
                  1966,
                  711,
                  1966,
                  851,
                  1674,
                  841
                ],
                "text": "fox",
                "confidence": 0.442
              },
              {
                "boundingBox": [
                  2083,
                  714,
                  2580,
                  725,
                  2579,
                  876,
                  2083,
                  855
                ],
                "text": "jumps",
                "confidence": 0.878
              }
            ]
          },
          {
            "boundingBox": [
              187,
              1062,
              485,
              1056,
              486,
              1120,
              189,
              1126
            ],
            "text": "over",
            "words": [
              {
                "boundingBox": [
                  190,
                  1064,
                  439,
                  1059,
                  441,
                  1122,
                  192,
                  1126
                ],
                "text": "over",
                "confidence": 0.37
              }
            ]
          },
          {
            "boundingBox": [
              664,
              1008,
              1973,
              1023,
              1969,
              1178,
              664,
              1154
            ],
            "text": "the lazy dog!",
            "words": [
              {
                "boundingBox": [
                  668,
                  1008,
                  923,
                  1015,
                  923,
                  1146,
                  669,
                  1117
                ],
                "text": "the",
                "confidence": 0.909
              },
              {
                "boundingBox": [
                  1107,
                  1018,
                  1447,
                  1023,
                  1445,
                  1178,
                  1107,
                  1162
                ],
                "text": "lazy",
                "confidence": 0.853
              },
              {
                "boundingBox": [
                  1639,
                  1024,
                  1974,
                  1023,
                  1971,
                  1170,
                  1636,
                  1178
                ],
                "text": "dog!",
                "confidence": 0.41
              }
            ]
          }
        ]
      }
    ]
  }
}

Solution

  • Using LINQ, you can produce the CSV in the expected format. The sample below shows how:

    // Builds one CSV row per text line: the line's words joined with commas,
    // plus the mean confidence of those words. The CSV is written into an
    // in-memory StringBuilder and printed to the console at the end.
    StringBuilder csv = new StringBuilder();
    using (var p = new ChoJSONReader("*** YOUR JSON PATH ***")
        .WithJSONPath("$..readResults")
        )
    {
        using (var w = new ChoCSVWriter(csv)
            .WithFirstLineHeader()
            )
        {
            w.Write(p
                .SelectMany(r1 => ((dynamic[])r1.lines)
                .Select(r2 => new
                {
                    r1.page,
                    r2.text,
                    // All words of the line collapsed into a single comma-separated field.
                    words = String.Join(",", ((dynamic[])r2.words).Select(s1 => s1.text)),
                    // Mean confidence of the line's words, rounded to three decimals so that
                    // e.g. 0.7294 is reported as 0.729. Note that Average() throws
                    // InvalidOperationException if a line has no words.
                    confidence = Math.Round(((dynamic[])r2.words).Select(s1 => (double)s1.confidence).Average(), 3)
                })));
        }
    }
    
    Console.WriteLine(csv.ToString());