How to Construct a re.findall Regex in Python to Capture YouTube Timestamps

Script

from __future__ import unicode_literals
import youtube_dl
import pandas as pd
import csv
import re

# Options dict handed to youtube_dl.YoutubeDL in run_scraper; it is empty,
# so no options are overridden.
ydl_opts = {}

# Load single.csv with pandas.
# NOTE(review): despite its name this holds whatever pd.read_csv returns
# rather than a row count, and nothing in the visible script reads it
# afterwards -- confirm whether it is still needed.
number_of_rows = pd.read_csv('single.csv')

# Scrape Online Product
def run_scraper():
    """Fetch the description of every video listed in ``single.csv``.

    The first CSV row is skipped as a header. For each remaining row the
    first column is passed to youtube_dl as the video URL (metadata only,
    nothing is downloaded) and the resulting description is handed to
    ``get_links``.
    """
    with open("single.csv", "r") as csv_file:
        rows = csv.reader(csv_file)
        # Discard the header row.
        next(rows)

        for row in rows:
            with youtube_dl.YoutubeDL(ydl_opts) as downloader:
                # download=False: only the metadata dict is wanted.
                info = downloader.extract_info(row[0], download=False)
                video_description = info['description']

                # Show the timestamp lines found in this description.
                get_links(video_description)
                

def get_links(description):
    """Print the timestamp matches found in *description*.

    Two lists are printed, separated by a blank line: first the matches
    that run from a ``dd:dd`` timestamp to the end of its line, then the
    matches that run from the start of a line up to a ``dd:dd`` timestamp.
    Nothing is returned.
    """
    # Format: Timestamp + Text
    timestamp_then_text = re.findall(r'(\d{2}:\d{2}?.*)', description)
    print(timestamp_then_text, end="\n\n")

    # Format: Text + Timestamp
    text_then_timestamp = re.findall(r'(.*\d{2}:\d{2}?)', description)
    print(text_then_timestamp)

# Start the scrape as soon as this script is executed.
run_scraper()

CSV File

Videos, Format
https://www.youtube.com/watch?v=kqtD5dpn9C8, Format: Timestamp + Text
https://www.youtube.com/watch?v=pJ3IPRqiD2M, Format: Text + Timestamp
https://www.youtube.com/watch?v=rfscVS0vtbw, No Regex in code
https://www.youtube.com/watch?v=t8pPdKYpowI, No Regex in code

My script reads YouTube URLs from a CSV file and fetches each video's description, which contains general information such as an intro, links, timestamps, etc.

I want to capture the YouTube Timestamp Descriptions only as highlighted in the image below:

I understand that YouTube timestamp formatting is not consistent and for that reason I have included a few examples in the CSV file.

In my function get_links, I have partially managed to extract Timestamp + Text and Text + Timestamp for 2 of the 4 URLs listed in the CSV file.

I need a way to show only the text (description) part of each timestamp line, irrespective of which of the formatting types used by the 4 URLs in the CSV file it follows.

Any help would be much appreciated.

Solution

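Try processing the description line by line: keep only the lines that contain a timestamp, then strip the timestamp and any non-letter characters around it, so that just the text remains: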

import youtube_dl
import pandas as pd
import csv
import re

# Options dict handed to youtube_dl.YoutubeDL; empty, so nothing is overridden.
ydl_opts = {}

# A description line counts as a timestamp line when it contains something
# shaped like a timestamp: digits, a colon, digits.
r_pat = re.compile(r"\d+:\d+")

# Matches a timestamp together with every non-letter character touching it
# (brackets, dashes, emoji, whitespace, digits), so substituting "" leaves
# only the text. "Non-letter" is spelled [\W\d_] rather than [^A-Za-z] so
# that accented and non-Latin letters are kept; for pure-ASCII input the two
# classes are identical.
# NOTE: digits adjacent to the timestamp are removed too. That drops list
# numbering such as "1." but also means a title that starts or ends with a
# digit loses it.
r_pat2 = re.compile(r"[\W\d_]*\d+:\d+:?\d*?[\W\d_]*")

# Scrape Online Product
def run_scraper():
    """Print the timestamp texts of every video listed in ``single.csv``.

    The first CSV row is treated as a header and skipped. For each remaining
    non-blank row, the first column is used as the video URL: its metadata is
    fetched with youtube_dl (nothing is downloaded), the description is passed
    to ``get_links`` and the returned lines are printed one per line, followed
    by an 80-dash separator.

    Raises:
        OSError: if ``single.csv`` cannot be opened.
    """
    # newline="" is the mode the csv module documents for files it reads.
    with open("single.csv", "r", newline="") as f:
        csv_reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(csv_reader, None)

        # One YoutubeDL object serves every row; it does not need to be
        # rebuilt per URL.
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for csv_line_entry in csv_reader:
                # csv.reader yields [] for blank lines; skip those and rows
                # whose URL column is empty instead of failing on them.
                if not csv_line_entry or not csv_line_entry[0].strip():
                    continue
                url = csv_line_entry[0].strip()

                meta = ydl.extract_info(url, download=False)
                # NOTE(review): assumes the description may be missing or
                # None for some videos; fall back to "" so get_links always
                # receives a string -- confirm against youtube_dl.
                description = meta.get("description") or ""
                out = get_links(description)
                print(*out, sep="\n")
                print("-" * 80)


def get_links(description):
    """Return the text part of every timestamp line in *description*.

    The description is split into lines. A line is kept only if ``r_pat``
    finds a match in it, and every match of ``r_pat2`` is removed from the
    kept line. Results are returned as a list in the original line order.
    """
    return [
        r_pat2.sub("", text_line)
        for text_line in description.splitlines()
        if r_pat.search(text_line)
    ]


# Only start scraping when run as a script, so importing this module (for
# example to reuse get_links) does not trigger file or network access.
if __name__ == "__main__":
    run_scraper()

Prints:

[youtube] kqtD5dpn9C8: Downloading webpage
Introduction 
What You Can Do With Python 
Your First Python Program 
Variables
Receiving Input
Type Conversion
Strings
Arithmetic Operators 
Operator Precedence 
Comparison Operators 
Logical Operators
If Statements
Exercise
While Loops
Lists
List Methods
For Loops
The range() Function 
Tuples
--------------------------------------------------------------------------------
[youtube] pJ3IPRqiD2M: Downloading webpage
Python Course
What is Python
Why choose Python
Features of Python
Applications of Python
Salary Trends
Quiz
Installing Python
Python Variable
Python Tokens


...and so on.