Search code examples
pythonpython-3.xweb-scrapingdigg

Scraped images are corrupt


Hi, I am trying to scrape the front-page images on digg.com with the following code. The issue is that 0.jpg to 6.jpg are fine, but 7.jpg to 47.jpg are all corrupt, and I am not sure why.

Here is the code. Github here: https://github.com/kenpeter/py_mm

# os
import os
# http request
import requests
#
import pprint

import time

# import html from lxml
from lxml import html

# Module-level state.
# Page number placeholder; it is not read by the active code shown here.
global_page_num = 0
# Pretty-printer for ad-hoc debug output; not called by the active code
# shown here.
pp = pprint.PrettyPrinter(indent=4)

def download_image(img_urls, timeout=30):
    """Download every URL in *img_urls* to ``img/<index>.jpg``.

    Files are numbered by the position of the URL in *img_urls*, starting
    at 0. A URL that cannot be fetched (connection error, timeout or HTTP
    error status) is reported and skipped, so an error page is never saved
    under an image file name.

    Args:
        img_urls: Sequence of image URLs to download.
        timeout: Seconds to wait for each request before giving up.
    """
    amount = len(img_urls)
    if not amount:
        # Nothing to download; do not create the output directory either.
        return

    # The target directory is the same for every file, so create it once.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch before opening the file so that a failed request does not
        # leave an empty or truncated image behind.
        try:
            response = requests.get(value, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            print('Failed to download %s: %s' % (value, error))
            continue

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Scrape the digg.com front page and download its thumbnail images.

    The ``src`` attribute of every thumbnail ``<img>`` is collected and the
    resulting list is passed to ``download_image``.

    Args:
        num: Page number requested by the caller; not used by this
            function.

    Returns:
        The list of image URLs taken from the ``src`` attributes.
    """
    front_page = requests.get('http://digg.com').content
    tree = html.fromstring(front_page)

    thumb_srcs = tree.xpath(
        "//div[@class='digg-story__image--thumb']/a/img/@src")

    # Story descriptions are extracted as well but are not used further.
    descriptions = tree.xpath("//div[@itemprop='description']/text()")

    download_image(thumb_srcs)
    return thumb_srcs


if __name__ == '__main__':
    # Script entry point. The page number is hard-coded for now; the idea
    # of reading it from the user was sketched as:
    # page_number = input('Please enter the page number that you want to scrape:')
    # Note that get_page_number accepts the value but does not use it.
    page_number = 4 # hardcode
    get_page_number(page_number)

Solution

  • The images are "corrupt" because the markup changes partway down the page: the later images are lazy-loaded, so their real URL "hides" in the data-src attribute, while the src attribute, which is what your code grabs, only holds a placeholder image. Here is an example from the source of the fetched page showing both attributes:

    <img
    class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset"
    data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
    src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
    width="312"
    height="170"
    alt=""
    />
    

    In other words, you have to check both the src and data-src attributes, giving data-src priority over src, when building the list of image URLs.

    This code does the trick and downloads the proper images:

    # os
    import os
    # http request
    import requests
    #
    import pprint
    
    import time
    
    # import html from lxml
    from lxml import html
    
    # Module-level state.
    # Page number placeholder; it is not read by the active code shown here.
    global_page_num = 0
    # Pretty-printer for ad-hoc debug output; not called by the active code
    # shown here.
    pp = pprint.PrettyPrinter(indent=4)
    
    def download_image(img_urls, timeout=30):
        """Download every URL in *img_urls* to ``img/<index>.jpg``.

        Files are numbered by the position of the URL in *img_urls*,
        starting at 0. A URL that cannot be fetched (connection error,
        timeout or HTTP error status) is reported and skipped, so an error
        page is never saved under an image file name.

        Args:
            img_urls: Sequence of image URLs to download.
            timeout: Seconds to wait for each request before giving up.
        """
        amount = len(img_urls)
        if not amount:
            # Nothing to download; do not create the output directory either.
            return

        # The target directory is the same for every file, so create it once.
        os.makedirs('img', exist_ok=True)

        for index, value in enumerate(img_urls):
            filename = 'img/%s.jpg' % index

            print('--- start ---')
            print('filename: %s' % filename)
            print('Downloading: %s out of %s' % (index, amount))

            # Fetch before opening the file so that a failed request does
            # not leave an empty or truncated image behind.
            try:
                response = requests.get(value, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as error:
                print('Failed to download %s: %s' % (value, error))
                continue

            with open(filename, 'wb') as f:
                f.write(response.content)
    
    
    def get_page_number(num):
        """Scrape the digg.com front page and download its thumbnail images.

        Lazy-loaded thumbnails carry a placeholder in ``src`` and the real
        image URL in ``data-src``, so ``data-src`` takes priority over
        ``src`` for every ``<img>``. URLs are returned in document order.

        Args:
            num: Page number requested by the caller; currently unused.

        Returns:
            The list of image URLs that were handed to ``download_image``.

        Raises:
            requests.RequestException: If the front page cannot be fetched.
        """
        url = 'http://digg.com'
        # Without a timeout a stalled connection would block forever.
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        selector = html.fromstring(page.content)

        img_urls = []
        for img in selector.xpath(
                "//div[@class='digg-story__image--thumb']/a/img"):
            # Resolve each <img> on its own so the result keeps page order
            # and an image that has both attributes is listed only once.
            img_url = img.get('data-src') or img.get('src')
            # Skip images without any URL and bare placeholder images.
            if img_url and 'x_455x248.png' not in img_url:
                img_urls.append(img_url)

        download_image(img_urls)

        return img_urls
    
    
    if __name__ == '__main__':
        # Script entry point. The page number is hard-coded for now; the
        # idea of reading it from the user was sketched as:
        # page_number = input('Please enter the page number that you want to scrape:')
        # Note that get_page_number accepts the value but does not use it.
        page_number = 4 # hardcode
        get_page_number(page_number)