I have an excel
(.xls
) file that needs to be parsed into a .csv
file. I am using the latest stable version of NPOI
library for c#
. The problem is I am getting ragged right CSV
instead of getting normalized row size.
What the data in excel file looks like
The output csv
file is:
"FirstName","MiddleName","LastName","PhNum"
"John","L","Doe","555-555-5555"
"Little","Ding","Bat"
"Roger","D","Rabbit","123-456-7890"
What I would like for it to happen is add an extra delimeter at the end of the second data row(after "Bat"), like so:
"FirstName","MiddleName","LastName","PhNum"
"John","L","Doe","555-555-5555"
"Little","Ding","Bat",
"Roger","D","Rabbit","123-456-7890"
Here is my code:
public override bool ParseFile()
{
FileStream iFile = new FileStream(InputFileName, FileMode.Open);
HSSFWorkbook wb = new HSSFWorkbook(iFile);
ExcelExtractor extractor = new ExcelExtractor(wb);
extractor.IncludeBlankCells = true;
bool result = true;
if (AllWorksheets)
{
for (int i = 0; i < wb.NumberOfSheets; i++)
result = result && ParseWorksheet(wb, i);
}
else
{
result = ParseWorksheet(wb, 0);
}
return result;
}
protected char c = '"';
public static string FormatValue(string s, bool AddQuotes, char quoteChar)
{
if (AddQuotes)
{
return quoteChar + s + quoteChar;
}
return s;
}
private bool ParseWorksheet(HSSFWorkbook wb, int SheetIndex)
{
bool result = true;
HSSFSheet sheet = (HSSFSheet)wb.GetSheetAt(SheetIndex);
if (sheet.FirstRowNum == sheet.LastRowNum && sheet.LastRowNum == 0) return result;
System.IO.StreamWriter sw = new StreamWriter(OutputFileName, true);
for (int i = sheet.FirstRowNum; i <= sheet.LastRowNum; i++)
{
string OutputRow = String.Empty;
HSSFRow row = (HSSFRow)sheet.GetRow(i);
int Column = 0;
int MaxCol = 0;
int temp = 0;
for (int j = 0; j < row.LastCellNum; j++)
{
temp = row.LastCellNum;
if (temp > MaxCol)
{
MaxCol = temp;
}
}
for (int j = 0; j < MaxCol; j++)
{
if (j == row.Cells[Column].ColumnIndex)
{
switch (row.Cells[Column].CellType)
{
case NPOI.SS.UserModel.CellType.Boolean:
OutputRow += FormatValue(row.Cells[Column].BooleanCellValue.ToString(), AddQuotes, c) + Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.Formula:
OutputRow += FormatValue(row.Cells[Column].CachedFormulaResultType.ToString(), AddQuotes, c) + Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.Numeric:
OutputRow += FormatValue((NPOI.SS.UserModel.DateUtil.IsCellDateFormatted(row.Cells[Column]) ? row.Cells[Column].DateCellValue.ToShortDateString() : row.Cells[Column].NumericCellValue.ToString()), AddQuotes, c) + Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.Blank:
OutputRow += Delimiter.ToString();
break;
case NPOI.SS.UserModel.CellType.String:
OutputRow += FormatValue(row.Cells[Column].StringCellValue.ToString().Replace('\n', ' ').TrimEnd(), AddQuotes, c) + Delimiter.ToString();//replace the new line character to space due to formatting issue.
break;
default:
result = false;
break;
}
Column++;
}
else
{
OutputRow += Delimiter.ToString();
}
}
OutputRow = OutputRow.Remove(OutputRow.Length - 1);
sw.WriteLine(OutputRow);
}
sw.Flush();
sw.Close();
return result;
}`
Any suggestions would be much appreciated.
There are a couple of issues here contributing to the problem.
First, you are recalculating the MaxCol
for each row. If you want a non-ragged right edge, then you need to find the MaxCol
over all rows first, and then generate the output.
Second, you are using row.Cells[]
to try to get a particular cell for the row. Cells[]
ignores empty values. So if you happen to have a blank cell somewhere in the row, then all the remaining values will be shifted to the left, and the length of the array will be less than MaxCol
. This will lead to an exception if you try to access row.Cells[MaxCol - 1]
on rows that have at least one blank value.
The solution to this problem is to use the row.GetCell(index)
method instead. This method returns the cell at column index
(0-based), or null
if that cell is empty. It is much more straightforward to work with, and will allow you to eliminate the special logic in your code which checks the ColumnIndex
of the current cell against your loop index j
to make sure the cell you got is really in the column you expected.
As an additional suggestion, I would recommend retrieving the current cell just once inside your inner loop and assigning it to a variable instead of re-retrieving it multiple times. This will make your code a little more efficient and easier to read.
Here is the revised code for the ParseWorksheet
method with all of the above changes:
private bool ParseWorksheet(HSSFWorkbook wb, int SheetIndex)
{
bool result = true;
HSSFSheet sheet = (HSSFSheet)wb.GetSheetAt(SheetIndex);
if (sheet.FirstRowNum == sheet.LastRowNum && sheet.LastRowNum == 0) return result;
StreamWriter sw = new StreamWriter(OutputFileName, true);
int MaxCol = 0;
for (int i = sheet.FirstRowNum; i <= sheet.LastRowNum; i++)
{
HSSFRow row = (HSSFRow)sheet.GetRow(i);
MaxCol = Math.Max(MaxCol, row.LastCellNum);
}
for (int i = sheet.FirstRowNum; i <= sheet.LastRowNum; i++)
{
string OutputRow = String.Empty;
HSSFRow row = (HSSFRow)sheet.GetRow(i);
for (int j = 0; j < MaxCol; j++)
{
HSSFCell cell = (HSSFCell)row.GetCell(j);
if (cell != null)
{
switch (cell.CellType)
{
case NPOI.SS.UserModel.CellType.Boolean:
OutputRow += FormatValue(cell.BooleanCellValue.ToString(), AddQuotes, c) + Delimiter;
break;
case NPOI.SS.UserModel.CellType.Formula:
OutputRow += FormatValue(cell.CachedFormulaResultType.ToString(), AddQuotes, c) + Delimiter;
break;
case NPOI.SS.UserModel.CellType.Numeric:
OutputRow += FormatValue((NPOI.SS.UserModel.DateUtil.IsCellDateFormatted(cell) ? cell.DateCellValue.ToShortDateString() : cell.NumericCellValue.ToString()), AddQuotes, c) + Delimiter;
break;
case NPOI.SS.UserModel.CellType.Blank:
OutputRow += Delimiter;
break;
case NPOI.SS.UserModel.CellType.String:
OutputRow += FormatValue(cell.StringCellValue.ToString().Replace('\n', ' ').TrimEnd(), AddQuotes, c) + Delimiter; //replace the new line character to space due to formatting issue.
break;
default:
result = false;
break;
}
}
else
{
OutputRow += Delimiter;
}
}
OutputRow = OutputRow.Remove(OutputRow.Length - 1);
sw.WriteLine(OutputRow);
}
sw.Flush();
sw.Close();
return result;
}