»

Dec 02

OpenXML Excel Quirk

I have discovered a nasty gremlin when using the OpenXML SDK to read data from an Excel SpreadSheet.

While reading the cells in a row using the OpenXML SDK I discovered that that the collection of Cells returned for a given Row is sparse (i.e. has missing cells)

In the Sheet I was processing In Row 58 Cells B58, C58 & D58 are empty, A58 & E58 contain strings.  If you look at the XML fragment below you will see that there is no XML for Cell C58 in the XML

<row r="58" spans="1:5">
  <c r="A58" s="17" t="">
    <v>97</v>
  </c> <c r="B58" s="19"/>
  <c r="D5" s="17"/>
  <c r="E5" s="15" t="">
    <v>16</v>
  </c>
</row>

The really weird thing is that Cells B58 and D58 are also empty

This really messed up things as I needed a string array with values in the correct column indexes

The code I ended up with to convert the row to a string array is listed below

// The ConvertRowToStringArray routine had to be written to cope with :-
// OpenXML returns a sparse array of cells for a given row
// i.e. Cell[A1],Cell[B1],Cell[D1],Cell[F1]
// Simply looping round the array of cells without reading their Cell Reference
// causes bugs due to data being in the wrong columns.
private static string[] ConvertRowToStringArray(WorkbookPart workbookPart, Row r)
{
    // Determine last cell in the row
    int lastCell = 1;
    string rowSpan = r.Spans;
    if (!string.IsNullOrEmpty(rowSpan))
    {
        string[] minmax = rowSpan.Split(':');
        lastCell = int.Parse(minmax[1]);
    }

    // Initialise array of cells to be returned
    string[] cells = new string[lastCell];
    for (int i = 0; i < lastCell; i++)
    {
        cells[i] = string.Empty;
    }

    // Loop round each cell in the row
    foreach (Cell c in r.Elements<Cell>())
    {
        // Use regular expression to split the Excel Cell Reference
        // into it's Column and Row parts
        int columnIndex = -1;
        Regex rex = new Regex("^(?<Col>[A-Z]+)(?<Row>[0-9]+)$");
        Match match = rex.Match(c.CellReference);
        if (match.Success)
        {
            columnIndex = ExcelColumnNameToNumber(match.Groups["Col"].Value) - 1;
        }

        // Convert DataType to a string so that we can use it in a switch statement
        string dataType = string.Empty;
        if (c.DataType != null)
        {
            dataType = c.DataType.Value.ToString();
        }

        // Get the value from the cell
        string value = string.Empty;
        string text = string.Empty;
        switch (dataType)
        {
            // Numbers - return as is
            case "":
            if (c.CellValue != null)
            {
                value = c.CellValue.Text;
                text = value;
            }
            break;

            // Strings - lookup the string from it's pointer
            case "SharedString":
            value = c.CellValue.Text;
            int index = int.Parse(value);
            text = GetSharedStringById(workbookPart, index);
            break;

            // Formula - Retun empty string
            case "String":
            text = "";
            break;

            // Other - Need to code this later if we encounter it ...
            default:
            text = "???";
            System.Diagnostics.Debug.WriteLine("Cell Datatype '" + dataType + "' not known");
            break;
        }
        cells[columnIndex] = text.Trim();
    }

    return cells;
}

// Convert Excel Column Name to a Number we can use as an array index
public static int ExcelColumnNameToNumber(string columnName)
{
    char[] characters = columnName.ToCharArray();

    int sum = 0;

    for (int i = 0; i < characters.Length; i++)
    {
        sum *= 26;
        sum += (characters[i] - 'A' + 1);
    }

    return sum;
}

private static WorksheetPart GetFirstWorkSheetPart(SpreadsheetDocument document, string sheetName)
{
    WorksheetPart worksheetPart = null;

    IEnumerable<Sheet> sheets = document.WorkbookPart.Workbook.GetFirstChild<Sheets>().Elements<Sheet>()
        .Where(s => s.Name == sheetName);

    if (sheets.Count() != 0)
    {
        string id = sheets.First().Id.Value;
        worksheetPart = (WorksheetPart)document.WorkbookPart.GetPartById(id);
    }

    return worksheetPart;
}

private static string GetSharedStringById(WorkbookPart workbookPart, int id)
{
    SharedStringItem item = workbookPart.SharedStringTablePart.SharedStringTable.Elements<SharedStringItem>().ElementAt(id);
    return item.Text.InnerText;
}

How to get at the raw XML

As most of you should know (by now) the new Office 2007+ format is a container for a set of files (which are mainly XML) stored in compressed zip format.  Hint: To see the files inside rename an .xlsx file to .zip, then examine the contents using WinZip or similar and will be revealed.

Leave a Reply

Your email address will not be published. Required fields are marked *

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>