I am working on a project that needs to read data from a GCS bucket using C# CsvHelper, then create a GBQ table and insert the data into it. However, the data saved into GBQ ends up in the wrong positions.
First, reading the data from GCS:
public async Task<MemoryStream> ReadCsvRecord(string url)
{
    Uri gcsUri = new Uri(url);
    // The first segment is '/' and the second is the bucket name,
    // so the object path should always begin with the third segment.
    var path = String.Join("", gcsUri.Segments, 1, gcsUri.Segments.Length - 1);
    var stream = new MemoryStream();
    await _storageClient.DownloadObjectAsync(_bucketName, path, stream);
    stream.Position = 0;
    return stream;
}
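For reference, this is roughly how Uri.Segments splits a GCS-style HTTPS object URL (the URL below is only an illustrative example, not from my code):

// Sketch: Uri.Segments for an HTTPS-style object URL.
var uri = new Uri("https://storage.googleapis.com/my-bucket/folder/file.csv");
// uri.Segments => { "/", "my-bucket/", "folder/", "file.csv" }
// Joining from index 1 keeps the bucket segment in the result,
// while joining from index 2 yields just the object name.
var objectName = String.Join("", uri.Segments, 2, uri.Segments.Length - 2);
Console.WriteLine(objectName); // folder/file.csv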
Second, creating the GBQ table and inserting the data:
MemoryStream gcsPayloadStream = await gcsReader.ReadCsvRecord(plannedTripUrl);
var reader = new StreamReader(gcsPayloadStream, Encoding.UTF8, true);
var projectId = this.gbqOpsClient.ProjectId;
using (reader)
{
    using (CsvReader csvr = new CsvReader(reader, CultureInfo.CurrentCulture))
    {
        csvr.Read();
        csvr.ReadHeader();
        var bigQueryTableInsertRows = new BigQueryTableInsertRows(tableId, projectId);
        string[] headerRowItems = csvr.Context.Reader.HeaderRecord;
        var i = 0;
        while (csvr.Read())
        {
            var key = "";
            var value = "";
            var j = 0;
            ArrayList rows = new ArrayList();
            while (j < headerRowItems.Length)
            {
                key = headerRowItems[j];
                value = csvr.GetField<string>(headerRowItems[j]);
                rows.Add(
                    new BigQueryInsertRow($"{i}{j}")
                    {
                        { key, value }
                    }
                );
                j++;
            }
            var rowArr = rows.ToArray(typeof(BigQueryInsertRow)) as BigQueryInsertRow[];
            bigQueryTableInsertRows.TableInsertRows(rowArr);
            i++;
        }
    }
}
The BigQueryTableInsertRows class follows Google's official sample: https://cloud.google.com/bigquery/docs/samples/bigquery-table-insert-rows
public class BigQueryTableInsertRows
{
    private readonly string tableId;
    private readonly string projectId;
    private readonly string datasetId;

    public BigQueryTableInsertRows(
        string tableId,
        string projectId,
        string datasetId = "my_datasetId")
    {
        this.tableId = tableId;
        this.projectId = projectId;
        this.datasetId = datasetId;
    }

    public void TableInsertRows(BigQueryInsertRow[] rows)
    {
        BigQueryClient client = BigQueryClient.Create(projectId);
        client.InsertRows(this.datasetId, this.tableId, rows);
    }
}
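The table itself is created separately before these inserts run; a minimal sketch of that step, assuming Google.Cloud.BigQuery.V2, all-string columns named after the CSV headers, and the same illustrative dataset name (this is not part of the code above):

// Sketch only: one STRING column per CSV header, reusing headerRowItems from the reading code.
BigQueryClient client = BigQueryClient.Create(projectId);
var schemaBuilder = new TableSchemaBuilder();
foreach (var header in headerRowItems)
{
    schemaBuilder.Add(header, BigQueryDbType.String);
}
// Create the table if it does not already exist.
client.GetOrCreateTable("my_datasetId", tableId, schemaBuilder.Build());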
However, the final result in the GBQ table looks like this, and only partial data gets saved:
C1    C2    C3    C4
1A
      2B
            3C
                  4D
The 1A, 2B, 3C, and 4D values should all be in the same row, but they end up in different rows. Does anyone know what's wrong with my code? Many thanks!
I have never used a GBQ table, but it looks like you are creating a new BigQueryInsertRow for every column value instead of one per CSV record. Does something like this work?
using (CsvReader csvr = new CsvReader(reader, CultureInfo.CurrentCulture))
{
    csvr.Read();
    csvr.ReadHeader();
    var bigQueryTableInsertRows = new BigQueryTableInsertRows(tableId, projectId);
    string[] headerRowItems = csvr.Context.Reader.HeaderRecord;
    var i = 0;
    ArrayList rows = new ArrayList();
    while (csvr.Read())
    {
        var key = "";
        var value = "";
        var j = 0;
        // One insert row per CSV record, not one per column value.
        var bigQueryRow = new BigQueryInsertRow($"{i}");
        while (j < headerRowItems.Length)
        {
            key = headerRowItems[j];
            value = csvr.GetField<string>(headerRowItems[j]);
            bigQueryRow.Add(key, value);
            j++;
        }
        i++;
        rows.Add(bigQueryRow);
    }
    var rowArr = rows.ToArray(typeof(BigQueryInsertRow)) as BigQueryInsertRow[];
    bigQueryTableInsertRows.TableInsertRows(rowArr);
}
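As a side note, the same fix can be written with a List<BigQueryInsertRow> and a foreach instead of ArrayList and manual index bookkeeping; this is just a sketch of the same idea, not a drop-in replacement:

// Same fix, sketched with List<T> and foreach (requires System.Collections.Generic).
var rows = new List<BigQueryInsertRow>();
var i = 0;
while (csvr.Read())
{
    // One BigQueryInsertRow per CSV record, keyed by the header names.
    var bigQueryRow = new BigQueryInsertRow(insertId: $"{i}");
    foreach (var header in headerRowItems)
    {
        bigQueryRow.Add(header, csvr.GetField<string>(header));
    }
    rows.Add(bigQueryRow);
    i++;
}
bigQueryTableInsertRows.TableInsertRows(rows.ToArray());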