python post python-requests multipartform-data frames

Trying to post to a form that uses frames and retrieve the data using python

Thanks to Northcat and others, I was able to post a multipart/form-data request to http://www.camp.bicnirrh.res.in/featcalc/ using requests - and it worked like a charm. I'm now trying to POST data to http://pro-161-70.ib.unicamp.br/~itaraju/tools/pimw/ and select just the "Show pI/MW values" option. I'm uploading a file called Denovo. This is what I've tried so far, following the same format that worked in my earlier question.

import requests
import urllib
session = requests.Session()
file={'file': (open('Bishop/Denovo.txt', 'r').read())}
url = 'http://pro-161-70.ib.unicamp.br/~itaraju/tools/pimw/pimw.htm'
payload = {"opShowpimw":"opShowpimw", "opUseTabs":"opUseTabs"}
raw = urllib.urlencode(payload)
response = session.post(url, files=file, data=payload)
print response.text

I'm using the url in the code rather than the one listed at the top because the website uses frames and returns 'This page uses frames, but your browser doesn\'t support them'. So I found the url above by using 'view frame source'. The payload came from looking at ieheaders. The first entry in the payload corresponds to "Show pI/MW values", and the second was a shot in the dark at making the result come out as text (on the form, click '.txt format'). The response doesn't have values in it and looks like the first page. The url for the frame source on the results page is 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/pimw.cgi'; however, using this yields no response.

Solution

I send the sequence as text in the tbSeq field.

I found this sequence on http://pro-161-70.ib.unicamp.br/~itaraju/tools/pimw/what.htm

It gives me some results and an image (shown below), which is saved on disk as 'output.gif'.

import requests
import lxml.html

# CGI script that the form data is POSTed to.
url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/pimw.cgi'
# Form fields sent in the POST body.
payload = {
    'arquivo': '',  # left empty here; the sequence is sent as text in tbSeq
    'opShowTitle': 'ON',
    'opShowSeq': 'ON',
    'opShowStat': 'ON',
    'opShowpimw': 'ON',
    'opGelVirtual': 'ON',
    'opMap': 'gel0.def',
    'opPK': 'Default',
    # NOTE(review): the tb* numbers are presumably the pK values offered by
    # the form (termini and side chains) - confirm against the form.
    'tbCt': 3.55,
    'tbNt': 7,
    'tbArg': 12.01,
    'tbAsp': 4.06,
    'tbCys': 9,
    'tbGlu': 4.45,
    'tbHis': 5.985,
    'tbLys': 10.01,
    'tbTyr': 10.01,
    # Sequence text: a '>' header line followed by the residues.
    'tbSeq': '''>gi|532319|pir|TVFV2E|TVFV2E envelope protein
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVH
CTNLMNTTVTTGLLLNGSYSENRTQIWQKHRTSNDS
ALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQ
KYNLRLRQAWCHFPSNWKGAWKEVKEEIVNLPKER
YRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPG
PCVQRTYVACHIRSVIIWLETISKKTYAPPREGHLECT
STVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRY
KLVEITPIGFAPTEVRRYTGGHERQKRVPFVXXXXXX
XXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
LAAVEAQQQMLKLTIWGVK''',
}

# send POST    
r = requests.post(url, data=payload)

#print r.text

# convert HTML string into HTML tree
html = lxml.html.fromstring(r.text)

# get all images
imgs = html.cssselect('img')

# get second image
if len(imgs) > 1:
    url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/' + imgs[1].attrib['src'].strip()

    print "Downloading ...",  url

    with open('output.gif', 'wb') as handle:
        r = requests.get(url, stream=True)

        if not r.ok:
            # Something went wrong
            pass

        for block in r.iter_content(1024):
            if not block:
                break

            handle.write(block)
            print '.',

        print 

# get data
# One output line per <tr> matched in the page; newlines inside the element
# text are turned into ' | ' separators.
# NOTE(review): the inner selector is 'tr', not 'td', although the loop
# variable is named td. The Result listing below this script was produced
# with this selector, so confirm the intent before changing it.
for tr in html.cssselect('tr'):
    for td in tr.cssselect('tr'):
        print td.text_content().strip().replace('\n', ' | '),
    print

Result:

Downloading ... http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/../../../tools/htdocs/tmp/gel.15548.gif
. . . . . . . . . . . . . . . . . . . . . . . . . .


ORF:
gi|532319|pir|TVFV2E|TVFV2E envelope protein
Sequence:
ELRLRYCAPAGFALLKCNDADYDGFKTNCS NVSVVHCTNLMNTTVTTGLLLNGSYSENRT QIWQKHRTSNDSALILLNKHYNLTVTCKRP GNKTVLPVTIMAGLVFHSQKYNLRLRQAWC HFPSNWKGAWKEVKEEIVNLPKERYRGTND PKRIFFQRQWGDPETANLWFNCHGEFFYCK MDWFLNYLNNLTVDADHNECKNTSGTKSGN KRAPGPCVQRTYVACHIRSVIIWLETISKK TYAPPREGHLECTSTVTGMTVELNYIPKNR TNVTLSPQIESIWAAELDRYKLVEITPIGF APTEVRRYTGGHERQKRVPFVXXXXXXXXX XXXXXXXXXXXXXVQSQHLLAGILQQQKNL LAAVEAQQQMLKLTIWGVK
MW: |       pI:
40969.02 |  |   9.35
Amino-acid composition
Ala (A) | 20 | 5.3% |  | Cys (C) | 12 | 3.2% |  | Asp (D) | 10 | 2.6% |  | Glu (E) | 19 | 5.0% |  | Phe (F) | 12 | 3.2% |  | Gly (G) | 20 | 5.3% |  | His (H) | 11 | 2.9% |  | Ile (I) | 16 | 4.2% |  | Lys (K) | 24 | 6.3% |  | Leu (L) | 34 | 9.0% |  |    |  |  | Met (M) | 5 | 1.3% |  | Asn (N) | 27 | 7.1% |  | Pro (P) | 16 | 4.2% |  | Gln (Q) | 17 | 4.5% |  | Arg (R) | 21 | 5.5% |  | Ser (S) | 16 | 4.2% |  | Thr (T) | 30 | 7.9% |  | Val (V) | 24 | 6.3% |  | Trp (W) | 10 | 2.6% |  | Tyr (Y) | 13 | 3.4% Ala (A) | 20 | 5.3% Cys (C) | 12 | 3.2% Asp (D) | 10 | 2.6% Glu (E) | 19 | 5.0% Phe (F) | 12 | 3.2% Gly (G) | 20 | 5.3% His (H) | 11 | 2.9% Ile (I) | 16 | 4.2% Lys (K) | 24 | 6.3% Leu (L) | 34 | 9.0% Met (M) | 5 | 1.3% Asn (N) | 27 | 7.1% Pro (P) | 16 | 4.2% Gln (Q) | 17 | 4.5% Arg (R) | 21 | 5.5% Ser (S) | 16 | 4.2% Thr (T) | 30 | 7.9% Val (V) | 24 | 6.3% Trp (W) | 10 | 2.6% Tyr (Y) | 13 | 3.4%
Ala (A) | 20 | 5.3%
Cys (C) | 12 | 3.2%
Asp (D) | 10 | 2.6%
Glu (E) | 19 | 5.0%
Phe (F) | 12 | 3.2%
Gly (G) | 20 | 5.3%
His (H) | 11 | 2.9%
Ile (I) | 16 | 4.2%
Lys (K) | 24 | 6.3%
Leu (L) | 34 | 9.0%
Met (M) | 5 | 1.3%
Asn (N) | 27 | 7.1%
Pro (P) | 16 | 4.2%
Gln (Q) | 17 | 4.5%
Arg (R) | 21 | 5.5%
Ser (S) | 16 | 4.2%
Thr (T) | 30 | 7.9%
Val (V) | 24 | 6.3%
Trp (W) | 10 | 2.6%
Tyr (Y) | 13 | 3.4%
Total:  | 379
Theoretical 2D gel:

Small red dot :)

enter image description here

EDIT: example with a file - the file has to be sent in the field named arquivo

import requests
import lxml.html

# CGI script that the form data is POSTed to.
url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/pimw.cgi'
# Form fields sent in the POST body.
payload = {
    # 'arquivo' is not sent as a text field here; the file is uploaded under
    # that name through the files= argument of requests.post
    'opShowTitle': 'ON',
    'opShowSeq': 'ON',
    'opShowStat': 'ON',
    'opShowpimw': 'ON',
    'opGelVirtual': 'ON',
    'opMap': 'gel0.def',
    'opPK': 'Default',
    # NOTE(review): the tb* numbers are presumably the pK values offered by
    # the form (termini and side chains) - confirm against the form.
    'tbCt': 3.55,
    'tbNt': 7,
    'tbArg': 12.01,
    'tbAsp': 4.06,
    'tbCys': 9,
    'tbGlu': 4.45,
    'tbHis': 5.985,
    'tbLys': 10.01,
    'tbTyr': 10.01,
    'tbSeq': '',  # left empty; the sequence comes from the uploaded file
}

files = {'arquivo': open('sequence.fasta').read()}

#url = 'http://httpbin.org/post' # special portal for tests

# send POST    
r = requests.post(url, data=payload, files=files)

#print r.text

# convert HTML string into HTML tree
html = lxml.html.fromstring(r.text)

# get all images
imgs = html.cssselect('img')

# get second image
if len(imgs) > 1:
    url = 'http://pro-161-70.ib.unicamp.br/~itaraju/cgi-bin/itaraju/bioinf/' + imgs[1].attrib['src'].strip()

    print "Downloading ...",  url

    with open('output.gif', 'wb') as handle:
        r = requests.get(url, stream=True)

        if not r.ok:
            # Something went wrong
            pass

        for block in r.iter_content(1024):
            if not block:
                break

            handle.write(block)
            print '.',

        print 

# get data
# One output line per <tr> matched in the page; newlines inside the element
# text are turned into ' | ' separators.
# NOTE(review): the inner selector is 'tr', not 'td', although the loop
# variable is named td. Confirm which is intended before changing it, since
# it affects what is printed.
for tr in html.cssselect('tr'):
    for td in tr.cssselect('tr'):
        print td.text_content().strip().replace('\n', ' | '),
    print

Contents of the file sequence.fasta used above:

>gi|532319|pir|TVFV2E|TVFV2E envelope protein
ELRLRYCAPAGFALLKCNDADYDGFKTNCSNVSVVH
CTNLMNTTVTTGLLLNGSYSENRTQIWQKHRTSNDS
ALILLNKHYNLTVTCKRPGNKTVLPVTIMAGLVFHSQ
KYNLRLRQAWCHFPSNWKGAWKEVKEEIVNLPKER
YRGTNDPKRIFFQRQWGDPETANLWFNCHGEFFYCK
MDWFLNYLNNLTVDADHNECKNTSGTKSGNKRAPG
PCVQRTYVACHIRSVIIWLETISKKTYAPPREGHLECT
STVTGMTVELNYIPKNRTNVTLSPQIESIWAAELDRY
KLVEITPIGFAPTEVRRYTGGHERQKRVPFVXXXXXX
XXXXXXXXXXXXXXXXVQSQHLLAGILQQQKNL
LAAVEAQQQMLKLTIWGVK