I have discovered a nasty gremlin when using the OpenXML SDK to read data from an Excel SpreadSheet.
While reading the cells in a row using the OpenXML SDK I discovered that that the collection of Cells returned for a given Row is sparse (i.e. has missing cells)
In the Sheet I was processing In Row 58 Cells B58, C58 & D58 are empty, A58 & E58 contain strings. If you look at the XML fragment below you will see that there is no XML for Cell C58 in the XML
<row r="58" spans="1:5">
<c r="A58" s="17" t="">
<v>97</v>
</c> <c r="B58" s="19"/>
<c r="D5" s="17"/>
<c r="E5" s="15" t="">
<v>16</v>
</c>
</row>
The really weird thing is that Cells B58 and D58 are also empty
This really messed up things as I needed a string array with values in the correct column indexes
The code I ended up with to convert the row to a string array is listed below
// The ConvertRowToStringArray routine had to be written to cope with :-
// OpenXML returns a sparse array of cells for a given row
// i.e. Cell[A1],Cell[B1],Cell[D1],Cell[F1]
// Simply looping round the array of cells without reading their Cell Reference
// causes bugs due to data being in the wrong columns.
private static string[] ConvertRowToStringArray(WorkbookPart workbookPart, Row r)
{
// Determine last cell in the row
int lastCell = 1;
string rowSpan = r.Spans;
if (!string.IsNullOrEmpty(rowSpan))
{
string[] minmax = rowSpan.Split(':');
lastCell = int.Parse(minmax[1]);
}
// Initialise array of cells to be returned
string[] cells = new string[lastCell];
for (int i = 0; i < lastCell; i++)
{
cells[i] = string.Empty;
}
// Loop round each cell in the row
foreach (Cell c in r.Elements<Cell>())
{
// Use regular expression to split the Excel Cell Reference
// into it's Column and Row parts
int columnIndex = -1;
Regex rex = new Regex("^(?<Col>[A-Z]+)(?<Row>[0-9]+)$");
Match match = rex.Match(c.CellReference);
if (match.Success)
{
columnIndex = ExcelColumnNameToNumber(match.Groups["Col"].Value) - 1;
}
// Convert DataType to a string so that we can use it in a switch statement
string dataType = string.Empty;
if (c.DataType != null)
{
dataType = c.DataType.Value.ToString();
}
// Get the value from the cell
string value = string.Empty;
string text = string.Empty;
switch (dataType)
{
// Numbers - return as is
case "":
if (c.CellValue != null)
{
value = c.CellValue.Text;
text = value;
}
break;
// Strings - lookup the string from it's pointer
case "SharedString":
value = c.CellValue.Text;
int index = int.Parse(value);
text = GetSharedStringById(workbookPart, index);
break;
// Formula - Retun empty string
case "String":
text = "";
break;
// Other - Need to code this later if we encounter it ...
default:
text = "???";
System.Diagnostics.Debug.WriteLine("Cell Datatype '" + dataType + "' not known");
break;
}
cells[columnIndex] = text.Trim();
}
return cells;
}
// Convert Excel Column Name to a Number we can use as an array index
public static int ExcelColumnNameToNumber(string columnName)
{
char[] characters = columnName.ToCharArray();
int sum = 0;
for (int i = 0; i < characters.Length; i++)
{
sum *= 26;
sum += (characters[i] - 'A' + 1);
}
return sum;
}
private static WorksheetPart GetFirstWorkSheetPart(SpreadsheetDocument document, string sheetName)
{
WorksheetPart worksheetPart = null;
IEnumerable<Sheet> sheets = document.WorkbookPart.Workbook.GetFirstChild<Sheets>().Elements<Sheet>()
.Where(s => s.Name == sheetName);
if (sheets.Count() != 0)
{
string id = sheets.First().Id.Value;
worksheetPart = (WorksheetPart)document.WorkbookPart.GetPartById(id);
}
return worksheetPart;
}
private static string GetSharedStringById(WorkbookPart workbookPart, int id)
{
SharedStringItem item = workbookPart.SharedStringTablePart.SharedStringTable.Elements<SharedStringItem>().ElementAt(id);
return item.Text.InnerText;
}
How to get at the raw XML
As most of you should know (by now) the new Office 2007+ format is a container for a set of files (which are mainly XML) stored in compressed zip format. Hint: To see the files inside rename an .xlsx file to .zip, then examine the contents using WinZip or similar and will be revealed.