My program currently loops through a directory of pdf/image files and generates json files using the Azure computer vision REST API.
Besides generating the JSON files, I'd like to extract certain elements from the JSON generated, such as text
and confidence
and write them to a csv file, so that it would look somethin glike this:
e.g. desired csv file output:
filename page text words confidence
file1.pdf 1 the quick brown fox 0.98
file1.pdf 2 the color is great! 0.71
file2.jpeg 1 something something 0.87
However, this is a sample csv output of what im getting currently (goes all the way up to FJO column so impossible to paste all of the columns here):
How do i make the JsonToCsv()
function return the filename, page, text, words, and confidence elements only to the csv file for each file?
here is some of an example JSON generated from one of the files:
{
"status": "succeeded",
"createdDateTime": "2020-05-28T05:13:21Z",
"lastUpdatedDateTime": "2020-05-28T05:13:22Z",
"analyzeResult": {
"version": "3.1.0",
"readResults": [
{
"page": 1,
"language": "en",
"angle": 0.8551,
"width": 2661,
"height": 1901,
"unit": "pixel",
"lines": [
{
"boundingBox": [
67,
646,
2582,
713,
2580,
876,
67,
821
],
"text": "The quick brown fox jumps",
"words": [
{
"boundingBox": [
143,
650,
435,
661,
436,
823,
144,
824
],
"text": "The",
"confidence": 0.958
},
{
"boundingBox": [
full code:
static void Main(string[] args)
{
foreach (var file in Directory.GetFiles(inputFilesPath))
{
string inputFile = Path.GetFileName(file); // Return File Name + Extension
// Call the REST API method.
Console.WriteLine("\nExtracting text from: " + inputFile + "\n");
ReadText(file, jsonFilesPath).Wait();
}
//Output to CSV
foreach (var jsonFile in Directory.GetFiles(jsonFilesPath))
{
string csvFile = Path.GetFileName(csvFilePath); // Return File Name + Extension
// Call the REST API method.
Console.WriteLine("\nWriting/Parsing json to: " + csvFile + " file from: " + jsonFile + "\n");
JsonToCsv(jsonFile, csvFilePath);
}
}
public static void JsonToCsv(string jsonInputFile, string csvFile)
{
// Tools > Nuget Package Manager > Package Manager Console
// Install-Package ChoETL.JSON
using (var r = new ChoJSONReader(jsonInputFile))
{
using (var w = new ChoCSVWriter(csvFile).WithFirstLineHeader())
{
w.Write(r);
}
}
}
static async Task ReadText(string inputFilePath, string outputFilePath)
{
try
{
HttpClient client = new HttpClient();
// Request headers.
client.DefaultRequestHeaders.Add(
"Ocp-Apim-Subscription-Key", subscriptionKey);
string url = uriBase;
HttpResponseMessage response;
string operationLocation;
// Reads the contents of the specified local input
// into a byte array.
byte[] byteData = GetInputAsByteArray(inputFilePath);
// Adds the byte array as an octet stream to the request body.
using (ByteArrayContent content = new ByteArrayContent(byteData))
{
content.Headers.ContentType =
new MediaTypeHeaderValue("application/octet-stream");
response = await client.PostAsync(url, content);
}
if (response.IsSuccessStatusCode)
operationLocation =
response.Headers.GetValues("Operation-Location").FirstOrDefault();
else
{
// Display the JSON error data.
string errorString = await response.Content.ReadAsStringAsync();
Console.WriteLine("\n\nResponse:\n{0}\n",
JToken.Parse(errorString).ToString());
return;
}
string contentString;
int i = 0;
do
{
System.Threading.Thread.Sleep(1000);
response = await client.GetAsync(operationLocation);
contentString = await response.Content.ReadAsStringAsync();
++i;
}
while (i < 60 && contentString.IndexOf("\"status\":\"succeeded\"") == -1);
if (i == 60 && contentString.IndexOf("\"status\":\"succeeded\"") == -1)
{
Console.WriteLine("\nTimeout error.\n");
return;
}
// Display the JSON response.
Console.WriteLine("\nResponse:\n\n{0}\n",
JToken.Parse(contentString).ToString());
// Write JSON response to file
string inputFileName = Path.GetFileNameWithoutExtension(inputFilePath); // Extract File Name only
var responseData = JToken.Parse(contentString).ToString(); //Fetch Data and assign it for file output
string jsonOutputFile = Path.Combine(outputFilePath, inputFileName + ".json");
File.WriteAllText(jsonOutputFile, responseData);
}
catch (Exception e)
{
Console.WriteLine("\n" + e.Message);
}
}
Guess this may point you to right direction to generate expected CSV from JSON
StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("*** YOUR JSON FILE ***")
.WithJSONPath("$..readResults")
)
{
using (var w = new ChoCSVWriter(csv)
.WithFirstLineHeader()
)
{
w.Write(p
.SelectMany(r1 => ((dynamic[])r1.lines).SelectMany(r2 => ((dynamic[])r2.words).Select(r3 => new
{
r1.page,
r2.text,
words = r3.text,
r3.confidence
}))));
}
}
Console.WriteLine(csv.ToString());
Output:
page,text,words,confidence
1,The quick brown fox jumps,The,0.958
1,The quick brown fox jumps,quick,0.57
1,The quick brown fox jumps,brown,0.799
1,The quick brown fox jumps,fox,0.442
1,The quick brown fox jumps,jumps,0.878
1,over,over,0.37
1,the lazy dog!,the,0.909
1,the lazy dog!,lazy,0.853
1,the lazy dog!,dog!,0.41
UPDATE:
Use: ChoETL.JSON v1.2.1.2
To generate CSV with spaced field names, use WithField
to specify the field header for each field.
StringBuilder csv = new StringBuilder();
using (var p = new ChoJSONReader("sample43.json")
.WithJSONPath("$..readResults")
)
{
using (var w = new ChoCSVWriter(csv)
.WithField("FileName", fieldName: "File Name")
.WithField("page")
.WithField("text")
.WithField("words")
.WithField("confidence")
.WithFirstLineHeader()
)
{
w.Write(p
.SelectMany(r1 => ((dynamic[])r1.lines).SelectMany(r2 => ((dynamic[])r2.words).Select(r3 => new
{
FileName = "file.json",
r1.page,
r2.text,
words = r3.text,
r3.confidence
}))));
}
}
Console.WriteLine(csv.ToString());
Output:
File Name,page,text,words,confidence
file.json,1,The quick brown fox jumps,The,0.958
file.json,1,The quick brown fox jumps,quick,0.57
file.json,1,The quick brown fox jumps,brown,0.799
file.json,1,The quick brown fox jumps,fox,0.442
file.json,1,The quick brown fox jumps,jumps,0.878
file.json,1,over,over,0.37
file.json,1,the lazy dog!,the,0.909
file.json,1,the lazy dog!,lazy,0.853
file.json,1,the lazy dog!,dog!,0.41