My program currently loops through a directory of PDF/image files and generates JSON files using the Azure Computer Vision REST API. Using the JsonToCsv()
function below, I export specific JSON elements from those files into a CSV file, so that the output looks like this:
file.csv output:
page,text,words,confidence
1,The quick brown fox jumps,The,0.958
1,The quick brown fox jumps,quick,0.57
1,The quick brown fox jumps,brown,0.799
1,The quick brown fox jumps,fox,0.442
1,The quick brown fox jumps,jumps,0.878
1,over,over,0.37
1,the lazy dog!,the,0.909
1,the lazy dog!,lazy,0.853
1,the lazy dog!,dog!,0.41
What I'd like to do is consolidate the words so that they are comma-delimited on a single row instead of being spread over separate rows, and then average the confidence levels of the words belonging to each text. For example, the new file would look like:
page,text,words,confidence
1,The quick brown fox jumps,The,quick,brown,fox,jumps,0.729
1,over,over,0.37
1,the lazy dog!,the,lazy,dog!,0.724
Here, 0.729 for the first text is the average of the confidence levels of its words, i.e. (0.958+0.57+0.799+0.442+0.878)/5 = 0.7294, rounded to three decimal places. The same operation is performed on the last text as well.
How do I update the function below to accomplish this?
JsonToCsv()
code:
/// <summary>
/// Reads the nodes selected by the <c>$..readResults</c> JSON path from a JSON file
/// and writes them to a CSV file (with a header line), one row per word.
/// </summary>
/// <param name="jsonFile">Value passed to the <c>ChoJSONReader</c> constructor (the JSON input).</param>
/// <param name="csvfFile">Value passed to the <c>ChoCSVWriter</c> constructor (the CSV output).</param>
private static void JsonToCsv(string jsonFile, string csvfFile) {
    using (var reader = new ChoJSONReader(jsonFile).WithJSONPath("$..readResults"))
    using (var writer = new ChoCSVWriter(csvfFile).WithFirstLineHeader())
    {
        // Flatten readResult -> lines -> words; each word becomes one CSV row
        // that repeats the page and the text of the line it belongs to.
        var rows =
            from readResult in reader
            from line in (dynamic[])readResult.lines
            from word in (dynamic[])line.words
            select new
            {
                readResult.page,
                line.text,
                words = word.text,
                word.confidence
            };

        // The query is deferred: it is enumerated here, while both reader and writer are open.
        writer.Write(rows);
    }
}
sample JSON file:
{
"status": "succeeded",
"createdDateTime": "2020-05-28T05:13:21Z",
"lastUpdatedDateTime": "2020-05-28T05:13:22Z",
"analyzeResult": {
"version": "3.1.0",
"readResults": [
{
"page": 1,
"language": "en",
"angle": 0.8551,
"width": 2661,
"height": 1901,
"unit": "pixel",
"lines": [
{
"boundingBox": [
67,
646,
2582,
713,
2580,
876,
67,
821
],
"text": "The quick brown fox jumps",
"words": [
{
"boundingBox": [
143,
650,
435,
661,
436,
823,
144,
824
],
"text": "The",
"confidence": 0.958
},
{
"boundingBox": [
540,
665,
926,
679,
926,
825,
541,
823
],
"text": "quick",
"confidence": 0.57
},
{
"boundingBox": [
1125,
686,
1569,
700,
1569,
838,
1125,
828
],
"text": "brown",
"confidence": 0.799
},
{
"boundingBox": [
1674,
703,
1966,
711,
1966,
851,
1674,
841
],
"text": "fox",
"confidence": 0.442
},
{
"boundingBox": [
2083,
714,
2580,
725,
2579,
876,
2083,
855
],
"text": "jumps",
"confidence": 0.878
}
]
},
{
"boundingBox": [
187,
1062,
485,
1056,
486,
1120,
189,
1126
],
"text": "over",
"words": [
{
"boundingBox": [
190,
1064,
439,
1059,
441,
1122,
192,
1126
],
"text": "over",
"confidence": 0.37
}
]
},
{
"boundingBox": [
664,
1008,
1973,
1023,
1969,
1178,
664,
1154
],
"text": "the lazy dog!",
"words": [
{
"boundingBox": [
668,
1008,
923,
1015,
923,
1146,
669,
1117
],
"text": "the",
"confidence": 0.909
},
{
"boundingBox": [
1107,
1018,
1447,
1023,
1445,
1178,
1107,
1162
],
"text": "lazy",
"confidence": 0.853
},
{
"boundingBox": [
1639,
1024,
1974,
1023,
1971,
1170,
1636,
1178
],
"text": "dog!",
"confidence": 0.41
}
]
}
]
}
]
}
}
Using LINQ, you can produce the CSV in the expected format. The sample below shows how:
// Collect the CSV text in memory so it can be printed after the writer is disposed.
StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("*** YOUR JSON PATH ***")
    .WithJSONPath("$..readResults")
    )
{
    using (var w = new ChoCSVWriter(csv)
        .WithFirstLineHeader()
        )
    {
        // One output row per entry of each record's "lines" array: the words of the
        // line are joined with commas and their confidences are averaged.
        w.Write(p
            .SelectMany(r1 => ((dynamic[])r1.lines)
                .Select(r2 =>
                {
                    // Cast once and reuse for both the joined words and the average.
                    var lineWords = (dynamic[])r2.words;
                    return new
                    {
                        r1.page,
                        r2.text,
                        words = String.Join(",", lineWords.Select(s1 => s1.text)),
                        // Round to three decimals so the value matches the expected
                        // output (e.g. 0.7294 -> 0.729).
                        confidence = Math.Round(lineWords.Select(s1 => (double)s1.confidence).Average(), 3)
                    };
                })));
    }
}
Console.WriteLine(csv.ToString());