Search code examples
pythonexcelpandasxlsx

read only certain columns


I would like to only read certain columns into a pandas dataframe from an excel file. I would like to specify this via the column ranges via usecols to specify this, but am faced with an error. What is the correct way to specify the column ranges as letters (yes, I know I could provide the column indicies, instead)?

import pandas as pd
df = pd.read_excel("test.xlsx", usecols="A:AV", header=None)

~/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in read_excel(io, sheetname, header, skiprows, skip_footer, index_col, names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, has_index_names, converters, dtype, true_values, false_values, engine, squeeze, **kwds) 207 skip_footer=skip_footer, converters=converters, dtype=dtype, 208 true_values=true_values, false_values=false_values, squeeze=squeeze, --> 209 **kwds) 210 211

~/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in _parse_excel(self, sheetname, header, skiprows, names, skip_footer, index_col, has_index_names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, true_values, false_values, verbose, dtype, squeeze, **kwds) 508 squeeze=squeeze, 509 dtype=dtype, --> 510 **kwds) 511 512 output[asheetname] = parser.read()

~/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in TextParser(*args, **kwds) 1910 """ 1911 kwds['engine'] = 'python' -> 1912 return TextFileReader(*args, **kwds) 1913 1914

~/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in init(self, f, engine, **kwds) 762 self.options['has_index_names'] = kwds['has_index_names'] 763 --> 764 self._make_engine(self.engine) 765 766 def close(self):

~/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _make_engine(self, engine) 993 ' "c", "python", or' ' "python-fwf")'.format( 994 engine=engine)) --> 995 self._engine = klass(self.f, **self.options) 996 997 def _failover_to_python(self):

~/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in init(self, f, **kwds) 1994 # infer column indices from self.usecols if is is specified. 1995 self._col_indices = None -> 1996 self.columns, self.num_original_columns = self._infer_columns() 1997 1998 # Now self.columns has the set of columns that we will process.

~/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _infer_columns(self) 2387 else: 2388 columns = [lrange(ncols)] -> 2389 columns = self._handle_usecols(columns, columns[0]) 2390 else: 2391 if self.usecols is None or len(names) >= num_original_columns:

~/anaconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _handle_usecols(self, columns, usecols_key) 2422 for col in self.usecols: 2423 if isinstance(col, string_types): -> 2424 col_indices.append(usecols_key.index(col)) 2425
else: 2426 col_indices.append(col)

In [20]: pd.__version__
Out[20]: '0.20.2'

file: test.xlsx


Solution

  • Need upgrade to pandas 0.21.0 (dev discussion):

    df = pd.read_excel("test.xlsx", usecols='A:B', header=None)                              
    print (df.head())
    
              0                    1
    0  sequence  2015-02-11 00:00:00
    1        Aa                 6239
    2    AaBpL5                  NaN
    3    AaCjL5                  NaN
    4    AaDrL2                  NaN