Tags: python, ajax, youtube, comments, lxml

How to speed up Ajax requests in a Python YouTube scraper


I'm working on a simple scraper that crawls a YouTube video's comments page. The scraper uses Ajax to page through every comment on the video and then saves them to a JSON file. Even with a small number of comments (< 10), it still takes 3+ minutes to parse them.

I've tried adding requests-cache and using ujson instead of json to see if there were any benefits, but there's no noticeable difference.

Here's the code I'm using currently:

import os
import sys
import time
import ujson
import requests
import requests_cache
import argparse
import lxml.html

from lxml.cssselect import CSSSelector

requests_cache.install_cache('comment_cache')

YOUTUBE_COMMENTS_URL = 'https://www.youtube.com/all_comments?v={youtube_id}'
YOUTUBE_COMMENTS_AJAX_URL = 'https://www.youtube.com/comment_ajax'


def find_value(html, key, num_chars=2):
    pos_begin = html.find(key) + len(key) + num_chars
    pos_end = html.find('"', pos_begin)
    return html[pos_begin: pos_end]


def extract_comments(html):
    tree = lxml.html.fromstring(html)
    item_sel = CSSSelector('.comment-item')
    text_sel = CSSSelector('.comment-text-content')
    photo_sel = CSSSelector('.user-photo')


    for item in item_sel(tree):
        yield {'cid': item.get('data-cid'),
               'name': item.get('data-name'),
               'ytid': item.get('data-aid'),
               'text': text_sel(item)[0].text_content(),
               'photo': photo_sel(item)[0].get('src')}


def extract_reply_cids(html):
    tree = lxml.html.fromstring(html)
    sel = CSSSelector('.comment-replies-header > .load-comments')
    return [i.get('data-cid') for i in sel(tree)]


def ajax_request(session, url, params, data, retries=10, sleep=20):
    for _ in range(retries):
        response = session.post(url, params=params, data=data)
        if response.status_code == 200:
            response_dict = ujson.loads(response.text)
            return response_dict.get('page_token', None), response_dict['html_content']
        else:
            time.sleep(sleep)


def download_comments(youtube_id, sleep=1, order_by_time=True):
    session = requests.Session()

    # Get Youtube page with initial comments
    response = session.get(YOUTUBE_COMMENTS_URL.format(youtube_id=youtube_id))
    html = response.text
    reply_cids = extract_reply_cids(html)

    ret_cids = []
    for comment in extract_comments(html):
        ret_cids.append(comment['cid'])
        yield comment

    page_token = find_value(html, 'data-token')
    session_token = find_value(html, 'XSRF_TOKEN', 4)

    first_iteration = True

    # Get remaining comments (the same as pressing the 'Show more' button)
    while page_token:
        data = {'video_id': youtube_id,
                'session_token': session_token}

        params = {'action_load_comments': 1,
                  'order_by_time': order_by_time,
                  'filter': youtube_id}

        if order_by_time and first_iteration:
            params['order_menu'] = True
        else:
            data['page_token'] = page_token

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        page_token, html = response

        reply_cids += extract_reply_cids(html)
        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment

        first_iteration = False
        time.sleep(sleep)

    # Get replies (the same as pressing the 'View all X replies' link)
    for cid in reply_cids:
        data = {'comment_id': cid,
                'video_id': youtube_id,
                'can_reply': 1,
                'session_token': session_token}

        params = {'action_load_replies': 1,
                  'order_by_time': order_by_time,
                  'filter': youtube_id,
                  'tab': 'inbox'}

        response = ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)
        if not response:
            break

        _, html = response

        for comment in extract_comments(html):
            if comment['cid'] not in ret_cids:
                ret_cids.append(comment['cid'])
                yield comment
        time.sleep(sleep)


def main(argv):
    parser = argparse.ArgumentParser(add_help=False, description=('Download Youtube comments without using the Youtube API'))
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--youtubeid', '-y', help='ID of Youtube video for which to download the comments')
    parser.add_argument('--output', '-o', help='Output filename (output format is line delimited JSON)')
    parser.add_argument('--timeorder', '-t', action='store_true', help='Download Youtube comments ordered by time')

    try:
        args = parser.parse_args(argv)

        youtube_id = args.youtubeid
        output = args.output

        start_time = time.time()

        if not youtube_id or not output:
            parser.print_usage()
            raise ValueError('you need to specify a Youtube ID and an output filename')

        print 'Downloading Youtube comments for video:', youtube_id
        count = 0
        with open(output, 'wb') as fp:
            for comment in download_comments(youtube_id, order_by_time=bool(args.timeorder)):
                print >> fp, ujson.dumps(comment, escape_forward_slashes=False)
                count += 1
                sys.stdout.write('Downloaded %d comment(s)\r' % count)
                sys.stdout.flush()

        elapsed_time = time.time() - start_time

        print '\nDone! Elapsed time (seconds):', elapsed_time


    except Exception, e:
        print 'Error:', str(e)
        sys.exit(1)


if __name__ == "__main__":
    main(sys.argv[1:])

I'm new to Python, so I'm not sure where the bottlenecks are. The finished script will be used to parse 100,000+ comments, so performance is a major concern.

  • Would using multithreading solve the issue? And if so, how would I refactor this to benefit from it?
  • Is this strictly a network issue?

Solution

    • Yes, multithreading will speed up the process. Run the network operations (i.e. the downloading) in separate threads.
    • Yes, it is a network-related issue.

    Your requests are I/O bound. You make a request to YouTube, and it takes some time for the response to come back; that delay depends mostly on the network, so you can't make an individual request any faster. What you can do is use threads to send multiple requests in parallel. That won't speed up any single request, but you will get through more of them in less time.
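    As a rough, untested sketch of what that could look like for the reply threads (which are independent of one another): the helper below assumes ajax_request, extract_comments and YOUTUBE_COMMENTS_AJAX_URL from your script are in scope, drops the retry and order_by_time details, and uses concurrent.futures (standard library in Python 3, available on Python 2 via the futures backport). Note that sharing one requests session across threads is not officially guaranteed to be thread-safe, so giving each worker its own session is the safer choice.

    from concurrent.futures import ThreadPoolExecutor

    def fetch_replies(session, youtube_id, session_token, reply_cids, workers=8):
        # Each reply thread is an independent request, so the reply pages
        # can be fetched in parallel and parsed as they come back.
        def fetch_one(cid):
            data = {'comment_id': cid,
                    'video_id': youtube_id,
                    'can_reply': 1,
                    'session_token': session_token}
            params = {'action_load_replies': 1,
                      'filter': youtube_id,
                      'tab': 'inbox'}
            return ajax_request(session, YOUTUBE_COMMENTS_AJAX_URL, params, data)

        with ThreadPoolExecutor(max_workers=workers) as pool:
            # The pool keeps at most `workers` requests in flight at once;
            # map() returns results in the order the cids were submitted.
            for response in pool.map(fetch_one, reply_cids):
                if response:
                    _, html = response
                    for comment in extract_comments(html):
                        yield comment

    The same idea only goes so far for the 'Show more' pagination, since each page token comes from the previous response, so those requests have to stay sequential.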

    A threading tutorial with an example somewhat similar to your task: http://www.toptal.com/python/beginners-guide-to-concurrency-and-parallelism-in-python

    Also, since you will be doing a lot of scraping and processing, I would recommend taking a look at something like Scrapy - I personally use it for this kind of task.
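
    For a feel of what that involves: a Scrapy spider is just a class with a parse callback, and Scrapy schedules and downloads the requests concurrently for you. This is only a generic illustration, not a port of your script, and the spider name, URL and selectors are placeholders.

    import scrapy

    class CommentSpider(scrapy.Spider):
        # Placeholder name and start URL, purely for illustration.
        name = 'comments'
        start_urls = ['https://example.com/some-page-with-comments']

        def parse(self, response):
            # Scrapy downloads pages concurrently and calls parse() with
            # each response; every dict yielded here becomes an item in
            # the output feed.
            for item in response.css('.comment-item'):
                yield {'cid': item.xpath('@data-cid').extract_first(),
                       'text': item.css('.comment-text-content::text').extract_first()}

    Running it with "scrapy runspider spider.py -o comments.jl" writes line-delimited JSON, the same output format your script produces.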