Search code examples
python, post, python-requests, multipartform-data, frames

Trying to post to a form that uses frames and retrieve the data using python


Thanks to Northcat and others, I was able to post a multipart/form-data request to http://www.camp.bicnirrh.res.in/featcalc/ using requests - and it worked like a charm. I'm now trying to POST data to http://pro-161-70.ib.unicamp.br/~itaraju/tools/pimw/ and select just the "Show pI/MW values" options. I'm uploading a file called Denovo. This is what I've been trying so far, trying to follow the same format that worked from my earlier posted question.

# Asker's attempt (Python 2): send the file contents and two form options in a
# single POST and print whatever comes back.
import requests
import urllib
session = requests.Session()
# The value passed under 'file' is the file's text (already read), not an open
# file object.
file={'file': (open('Bishop/Denovo.txt', 'r').read())}
# Address of the frame that holds the form, found via "view frame source".
url = 'http://pro-161-70.ib.unicamp.br/~itaraju/tools/pimw/pimw.htm'
payload = {"opShowpimw":"opShowpimw", "opUseTabs":"opUseTabs"}
# NOTE(review): `raw` is never used below; the request is built from `payload`.
raw = urllib.urlencode(payload)
response = session.post(url, files=file, data=payload)
print response.text

I'm using the url in the code rather than the one listed at the top because the website uses frames and returns 'This page uses frames, but your browser doesn\'t support them'. So I found the url above by using 'view frame source'. The payload came from looking at ieheaders. The first entry in the payload corresponds to "Show pI/MW values", and the second was a shot in the dark to make things easier by having the result come out as text (on the form, click '.txt format'). The response doesn't have values in it and looks like the first page. The url for the frame source on the results page is 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/pimw.cgi'; however, using it yields no response.


Solution

  • I send the sequence as text in the tbSeq field

    I found this sequence on http://pro-161-70.ib.unicamp.br/~itaraju/tools/pimw/what.htm

    It gives me some results and an image (as below), saved on disk as 'output.gif'.

    import requests
    import lxml.html
    
    url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/pimw.cgi'
    payload = {
        'arquivo': '',
        'opShowTitle': 'ON',
        'opShowSeq': 'ON',
        'opShowStat': 'ON',
        'opShowpimw': 'ON',
        'opGelVirtual': 'ON',
        'opMap': 'gel0.def',
        'opPK': 'Default',
        'tbCt': 3.55,
        'tbNt': 7,
        'tbArg': 12.01,
        'tbAsp': 4.06,
        'tbCys': 9,
        'tbGlu': 4.45,
        'tbHis': 5.985,
        'tbLys': 10.01,
        'tbTyr': 10.01,
        'tbSeq': '''>gi|532319|pir|TVFV2E|TVFV2E envelope protein
    ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVH
    CTNLMNTTVTTGLLLNGSYSENRTQIWQKHRTSNDS
    ALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQ
    KYNLRLRQAWCHFPSNWKGAWKEVKEEIVNLPKER
    YRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
    MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPG
    PCVQRTYVACHIRSVIIWLETISKKTYAPPREGHLECT
    STVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRY
    KLVEITPIGFAPTEVRRYTGGHERQKRVPFVXXXXXX
    XXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
    LAAVEAQQQMLKLTIWGVK''',
    }
    
    # send POST    
    r = requests.post(url, data=payload)
    
    #print r.text
    
    # convert HTML string into HTML tree
    html = lxml.html.fromstring(r.text)
    
    # get all images
    imgs = html.cssselect('img')
    
    # get second image
    if len(imgs) > 1:
        url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/' + imgs[1].attrib['src'].strip()
    
        print "Downloading ...",  url
    
        with open('output.gif', 'wb') as handle:
            r = requests.get(url, stream=True)
    
            if not r.ok:
                # Something went wrong
                pass
    
            for block in r.iter_content(1024):
                if not block:
                    break
    
                handle.write(block)
                print '.',
    
            print 
    
    # get data
    for tr in html.cssselect('tr'):
        for td in tr.cssselect('tr'):
            print td.text_content().strip().replace('\n', ' | '),
        print 
    

    Result:

    Downloading ... http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/../../../tools/htdocs/tmp/gel.15548.gif
    . . . . . . . . . . . . . . . . . . . . . . . . . .
    
    
    ORF:
    gi|532319|pir|TVFV2E|TVFV2E envelope protein
    Sequence:
    ELRLRYCAPAGFALLKCNDADYDGFKTNCS NVSVVHCTNLMNTTVTTGLLLNGSYSENRT QIWQKHRTSNDSALILLNKHYNLTVTCKRP GNKTVLPVTIMAGLVFHSQKYNLRLRQAWC HFPSNWKGAWKEVKEEIVNLPKERYRGTND PKRIFFQRQWGDPETANLWFNCHGEFFYCK MDWFLNYLNNLTVDADHNECKNTSGTKSGN KRAPGPCVQRTYVACHIRSVIIWLETISKK TYAPPREGHLECTSTVTGMTVELNYIPKNR TNVTLSPQIESIWAAELDRYKLVEITPIGF APTEVRRYTGGHERQKRVPFVXXXXXXXXX XXXXXXXXXXXXXVQSQHLLAGILQQQKNL LAAVEAQQQMLKLTIWGVK
    MW: |       pI:
    40969.02 |  |   9.35
    Amino-acid composition
    Ala (A) | 20 | 5.3% |  | Cys (C) | 12 | 3.2% |  | Asp (D) | 10 | 2.6% |  | Glu (E) | 19 | 5.0% |  | Phe (F) | 12 | 3.2% |  | Gly (G) | 20 | 5.3% |  | His (H) | 11 | 2.9% |  | Ile (I) | 16 | 4.2% |  | Lys (K) | 24 | 6.3% |  | Leu (L) | 34 | 9.0% |  |    |  |  | Met (M) | 5 | 1.3% |  | Asn (N) | 27 | 7.1% |  | Pro (P) | 16 | 4.2% |  | Gln (Q) | 17 | 4.5% |  | Arg (R) | 21 | 5.5% |  | Ser (S) | 16 | 4.2% |  | Thr (T) | 30 | 7.9% |  | Val (V) | 24 | 6.3% |  | Trp (W) | 10 | 2.6% |  | Tyr (Y) | 13 | 3.4% Ala (A) | 20 | 5.3% Cys (C) | 12 | 3.2% Asp (D) | 10 | 2.6% Glu (E) | 19 | 5.0% Phe (F) | 12 | 3.2% Gly (G) | 20 | 5.3% His (H) | 11 | 2.9% Ile (I) | 16 | 4.2% Lys (K) | 24 | 6.3% Leu (L) | 34 | 9.0% Met (M) | 5 | 1.3% Asn (N) | 27 | 7.1% Pro (P) | 16 | 4.2% Gln (Q) | 17 | 4.5% Arg (R) | 21 | 5.5% Ser (S) | 16 | 4.2% Thr (T) | 30 | 7.9% Val (V) | 24 | 6.3% Trp (W) | 10 | 2.6% Tyr (Y) | 13 | 3.4%
    Ala (A) | 20 | 5.3%
    Cys (C) | 12 | 3.2%
    Asp (D) | 10 | 2.6%
    Glu (E) | 19 | 5.0%
    Phe (F) | 12 | 3.2%
    Gly (G) | 20 | 5.3%
    His (H) | 11 | 2.9%
    Ile (I) | 16 | 4.2%
    Lys (K) | 24 | 6.3%
    Leu (L) | 34 | 9.0%
    Met (M) | 5 | 1.3%
    Asn (N) | 27 | 7.1%
    Pro (P) | 16 | 4.2%
    Gln (Q) | 17 | 4.5%
    Arg (R) | 21 | 5.5%
    Ser (S) | 16 | 4.2%
    Thr (T) | 30 | 7.9%
    Val (V) | 24 | 6.3%
    Trp (W) | 10 | 2.6%
    Tyr (Y) | 13 | 3.4%
    Total:  | 379
    Theoretical 2D gel:
    

    Small red dot :)

    enter image description here


    EDIT: example with a file - the file has to be sent in the field named arquivo

    import requests
    import lxml.html
    
    url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/pimw.cgi'
    payload = {
    #    'arquivo': '', # remove it
        'opShowTitle': 'ON',
        'opShowSeq': 'ON',
        'opShowStat': 'ON',
        'opShowpimw': 'ON',
        'opGelVirtual': 'ON',
        'opMap': 'gel0.def',
        'opPK': 'Default',
        'tbCt': 3.55,
        'tbNt': 7,
        'tbArg': 12.01,
        'tbAsp': 4.06,
        'tbCys': 9,
        'tbGlu': 4.45,
        'tbHis': 5.985,
        'tbLys': 10.01,
        'tbTyr': 10.01,
        'tbSeq': '',
    }
    
    files = {'arquivo': open('sequence.fasta').read()}
    
    #url = 'http://httpbin.org/post' # special portal for tests
    
    # send POST    
    r = requests.post(url, data=payload, files=files)
    
    #print r.text
    
    # convert HTML string into HTML tree
    html = lxml.html.fromstring(r.text)
    
    # get all images
    imgs = html.cssselect('img')
    
    # get second image
    if len(imgs) > 1:
        url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/' + imgs[1].attrib['src'].strip()
    
        print "Downloading ...",  url
    
        with open('output.gif', 'wb') as handle:
            r = requests.get(url, stream=True)
    
            if not r.ok:
                # Something went wrong
                pass
    
            for block in r.iter_content(1024):
                if not block:
                    break
    
                handle.write(block)
                print '.',
    
            print 
    
    # get data
    for tr in html.cssselect('tr'):
        for td in tr.cssselect('tr'):
            print td.text_content().strip().replace('\n', ' | '),
        print 
    

    Contents of the file sequence.fasta used above:

    >gi|532319|pir|TVFV2E|TVFV2E envelope protein
    ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVH
    CTNLMNTTVTTGLLLNGSYSENRTQIWQKHRTSNDS
    ALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQ
    KYNLRLRQAWCHFPSNWKGAWKEVKEEIVNLPKER
    YRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
    MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPG
    PCVQRTYVACHIRSVIIWLETISKKTYAPPREGHLECT
    STVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRY
    KLVEITPIGFAPTEVRRYTGGHERQKRVPFVXXXXXX
    XXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
    LAAVEAQQQMLKLTIWGVK