Search code examples
python-3.x · pandas · dataframe · byte · minio

minIO Bucket | convert Bytes to Dataframe


Goal: create a pandas dataframe from bytes object

I assume there's a standard procedure for this, but I've not dealt with bytes before. I can see that every record is consistently terminated by \r\n.

\r - carriage return (an escape sequence; together with \n it forms the Windows-style line ending \r\n)

\n - newline, i.e. the end of a row/record

import pandas as pd
from sdg.datasource.MinioConn import MinioConn

client = MinioConn().client()  # project-specific wrapper; presumably returns a MinIO client (not shown here)
obj = client.get_object('project', 'foo/bar/Citizenship.csv')  # presumably (bucket name, object name)
data = obj.data  # whole object body as raw bytes; .decode('utf-8-sig') would give a str without the leading BOM

print(data)
print(type(data))

Output:

b'\xef\xbb\xbfCitizenship\r\nAfghan\r\nAlbanian\r\nAlgerian\r\nAmerican\r\nAndorran\r\nAngolan\r\nAnguillan\r\nArgentine\r\nArmenian\r\nAustralian\r\nAustrian\r\nAzerbaijani\r\nBahamian\r\nBahraini\r\nBangladeshi\r\nBarbadian\r\nBelarusian\r\nBelgian\r\nBelizean\r\nBeninese\r\nBermudian\r\nBhutanese\r\nBolivian\r\nBotswanan\r\nBrazilian\r\nBritish\r\nBritish Virgin Islander\r\nBruneian\r\nBulgarian\r\nBurkinan\r\nBurmese\r\nBurundian\r\nCambodian\r\nCameroonian\r\nCanadian\r\nCape Verdean\r\nCayman Islander\r\nCentral African\r\nChadian\r\nChilean\r\nChinese\r\nCitizen of Antigua and Barbuda\r\nCitizen of Bosnia and Herzegovina\r\nCitizen of Guinea-Bissau\r\nCitizen of Kiribati\r\nCitizen of Seychelles\r\nCitizen of the Dominican Republic\r\nCitizen of Vanuatu\r\nColombian\r\nComoran\r\nCongolese (Congo)\r\nCongolese (DRC)\r\nCook Islander\r\nCosta Rican\r\nCroatian\r\nCuban\r\nCymraes\r\nCymro\r\nCypriot\r\nCzech\r\nDanish\r\nDjiboutian\r\nDominican\r\nDutch\r\nEast Timorese\r\nEcuadorean\r\nEgyptian\r\nEmirati\r\nEnglish\r\nEquatorial Guinean\r\nEritrean\r\nEstonian\r\nEthiopian\r\nFaroese\r\nFijian\r\nFilipino\r\nFinnish\r\nFrench\r\nGabonese\r\nGambian\r\nGeorgian\r\nGerman\r\nGhanaian\r\nGibraltarian\r\nGreek\r\nGreenlandic\r\nGrenadian\r\nGuamanian\r\nGuatemalan\r\nGuinean\r\nGuyanese\r\nHaitian\r\nHonduran\r\nHong Konger\r\nHungarian\r\nIcelandic\r\nIndian\r\nIndonesian\r\nIranian\r\nIraqi\r\nIrish\r\nIsraeli\r\nItalian\r\nIvorian\r\nJamaican\r\nJapanese\r\nJordanian\r\nKazakh\r\nKenyan\r\nKittitian\r\nKosovan\r\nKuwaiti\r\nKyrgyz\r\nLao\r\nLatvian\r\nLebanese\r\nLiberian\r\nLibyan\r\nLiechtenstein 
citizen\r\nLithuanian\r\nLuxembourger\r\nMacanese\r\nMacedonian\r\nMalagasy\r\nMalawian\r\nMalaysian\r\nMaldivian\r\nMalian\r\nMaltese\r\nMarshallese\r\nMartiniquais\r\nMauritanian\r\nMauritian\r\nMexican\r\nMicronesian\r\nMoldovan\r\nMonegasque\r\nMongolian\r\nMontenegrin\r\nMontserratian\r\nMoroccan\r\nMosotho\r\nMozambican\r\nNamibian\r\nNauruan\r\nNepalese\r\nNew Zealander\r\nNicaraguan\r\nNigerian\r\nNigerien\r\nNiuean\r\nNorth Korean\r\nNorthern Irish\r\nNorwegian\r\nOmani\r\nPakistani\r\nPalauan\r\nPalestinian\r\nPanamanian\r\nPapua New Guinean\r\nParaguayan\r\nPeruvian\r\nPitcairn Islander\r\nPolish\r\nPortuguese\r\nPrydeinig\r\nPuerto Rican\r\nQatari\r\nRomanian\r\nRussian\r\nRwandan\r\nSalvadorean\r\nSammarinese\r\nSamoan\r\nSao Tomean\r\nSaudi Arabian\r\nScottish\r\nSenegalese\r\nSerbian\r\nSierra Leonean\r\nSingaporean\r\nSlovak\r\nSlovenian\r\nSolomon Islander\r\nSomali\r\nSouth African\r\nSouth Korean\r\nSouth Sudanese\r\nSpanish\r\nSri Lankan\r\nSt Helenian\r\nSt Lucian\r\nStateless\r\nSudanese\r\nSurinamese\r\nSwazi\r\nSwedish\r\nSwiss\r\nSyrian\r\nTaiwanese\r\nTajik\r\nTanzanian\r\nThai\r\nTogolese\r\nTongan\r\nTrinidadian\r\nTristanian\r\nTunisian\r\nTurkish\r\nTurkmen\r\nTurks and Caicos Islander\r\nTuvaluan\r\nUgandan\r\nUkrainian\r\nUruguayan\r\nUzbek\r\nVatican citizen\r\nVenezuelan\r\nVietnamese\r\nVincentian\r\nWallisian\r\nWelsh\r\nYemeni\r\nZambian\r\nZimbabwean\r\n'
<class 'bytes'>

print(pd.read_csv(data))  # raises OSError: read_csv expects a file path or file-like object, not bytes
print(type(pd.read_csv(data)))  # never reached, the line above already raises

Output:

Traceback (most recent call last):
  File "sdg/industry/gri/test.py", line 11, in <module>
    print(pd.read_csv(data))
  File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 688, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 454, in _read
    parser = TextFileReader(fp_or_buf, **kwds)
  File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 948, in __init__
    self._make_engine(self.engine)
  File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 1180, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/home/danielbellhv/miniconda3/envs/sdg/lib/python3.8/site-packages/pandas/io/parsers.py", line 2010, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas/_libs/parsers.pyx", line 382, in pandas._libs.parsers.TextReader.__cinit__
  File "pandas/_libs/parsers.pyx", line 687, in pandas._libs.parsers.TextReader._setup_parser_source
OSError: Expected file path name or file-like object, got <class 'bytes'> type

Solution

  • import pandas as pd
    from sdg.datasource.MinioConn import MinioConn
    from io import StringIO
    
    def minio_download(filename):
        """Download a CSV object from the 'project' bucket as a DataFrame.

        Args:
            filename: Object name relative to the 'foo/bar/' prefix,
                e.g. 'Citizenship.csv'.

        Returns:
            pandas.DataFrame parsed from the object's CSV content.
        """
        client = MinioConn().client()
        obj = client.get_object('project', f'foo/bar/{filename}')
        try:
            data = obj.data
        finally:
            # Assuming a standard MinIO client: get_object() returns an open
            # HTTP response that must be closed and released back to the
            # connection pool, even if reading the body fails.
            obj.close()
            obj.release_conn()

        # 'utf-8-sig' decodes UTF-8 and drops a leading BOM (b'\xef\xbb\xbf')
        # when present, so it can never leak into the first column name.
        text = data.decode('utf-8-sig')

        # read_csv needs a path or file-like object, so wrap the text in a buffer.
        return pd.read_csv(StringIO(text))
    
    # Fetch 'foo/bar/Citizenship.csv' via minio_download, then show the result and its type.
    df = minio_download('Citizenship.csv')
    print(df)
    print(type(df))
    

    Output:

        Citizenship
    0        Afghan
    1      Albanian
    2      Algerian
    3      American
    4      Andorran
    ..          ...
    220   Wallisian
    221       Welsh
    222      Yemeni
    223     Zambian
    224  Zimbabwean
    
    [225 rows x 1 columns]
    <class 'pandas.core.frame.DataFrame'>