Search code examples
pythonjsonparsinghtml-parser

convert links from html page to json format in python


I have an HTML page and I want to get the links from this page and then convert them into JSON format. This is the link to searchpage

Here is what I have tried.

class HtmltoJsonParser(HTMLParser):
    """HTMLParser subclass that records anchor ``href`` values in a dict.

    Every matching link is written to the same ``"href"`` key of the result
    dict, so after parsing only the last matching link is kept.
    """
    def __init__(self,raise_exception = True): 
        """Create the empty result dict and the bookkeeping attributes."""
        HTMLParser.__init__(self)
        #self.reset()
        # Result dict exposed through the ``json`` property.
        self.doc = {}
        # Not read by any method of this class as written.
        self.path = [] 
        # Alias of ``self.doc``: writes through ``cur`` land in ``doc``.
        self.cur = self.doc
        # Initialised to 0; not read by any method of this class as written.
        self.line = 0
        # Stored only; no method of this class consults it.
        self.raise_exception = raise_exception
        
    @property
    def json(self):
        """Return the result dict (a Python dict, not a JSON string)."""
        return self.doc
   
    @staticmethod
    def to_json(content, raise_exception = True):
        """Feed ``content`` to a fresh parser and return its result dict."""
        parser = HtmltoJsonParser(raise_exception = raise_exception)
        parser.feed(content)
        return parser.json
    
    def handle_starttag(self, tag, attrs):
        """Store the ``href`` of an ``a`` tag when it starts with ``"http"``."""
        # Only parse the 'anchor' tag.
        if tag == "a":
            for name,link in attrs:
                if name == "href" and link.startswith("http"):
                    # ``name`` is always "href" here, so this assignment
                    # replaces the previously stored link instead of adding one.
                    self.cur["" +name]= link
                    #print (link)

I took the help from this blog. I want to get an output like this

{
    "ads": [
        {
            "position": 1,
            "link": "https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwitk5Ou2qX6AhVK07IKHdyyCwQYABADGgJscg&ohost=www.google.com&cid=CAASJeRoa3Q-GtJJqeqbQ0EjhhL22QNYj4Sg_79Man_cWa0tjzSi8Ho&sig=AOD64_3-qhJH4tfcxt1VMfxwOTF8BKeFXA&q&adurl&ved=2ahUKEwikz4uu2qX6AhVXAxAIHfwECwoQ0Qx6BAgFEAM"
        },
        {
            "position": 2,
            "link": "https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwitk5Ou2qX6AhVK07IKHdyyCwQYABAAGgJscg&ohost=www.google.com&cid=CAASJeRoa3Q-GtJJqeqbQ0EjhhL22QNYj4Sg_79Man_cWa0tjzSi8Ho&sig=AOD64_1ZUcXQhcCFUYnBHo3jqlckXL2agg&q&adurl&ved=2ahUKEwikz4uu2qX6AhVXAxAIHfwECwoQ0Qx6BAgCEAE"
        }
    ]
}

but I'm getting this:

{'href': 'https://policies.google.com/terms?hl=en-PL&fg=1'}

Why is each link not being appended to the result held in self.cur? I have tried appending instead of assigning, but I get a KeyError every time.


Solution

  • the problem is here

    self.cur["" +name]= link
    

    Because name == 'href' is true for every matching link, this assignment always overwrites the value stored under the same 'href' key instead of appending a new entry. Try this:

        def handle_starttag(self, tag, attrs):
            """Append every absolute link found on an anchor tag to the result.

            Each ``href`` value that starts with ``"http"`` is added to the
            list stored under ``self.doc["ads"]`` as
            ``{"position": n, "link": url}``. Positions are counted from 1 and
            tracked in ``self.line``.

            Args:
                tag: Lower-cased tag name supplied by ``HTMLParser``.
                attrs: List of ``(name, value)`` attribute pairs for the tag.
            """
            # Only parse the 'anchor' tag.
            if tag != "a":
                return
            for name, link in attrs:
                # A valueless attribute (``<a href>``) arrives with value None,
                # so check truthiness before calling a string method on it.
                if name == "href" and link and link.startswith("http"):
                    # Increment first so the first recorded link is position 1.
                    self.line += 1
                    # ``self.doc`` starts out empty; create the "ads" list on
                    # first use rather than indexing a key that does not exist.
                    self.doc.setdefault("ads", []).append(
                        {"position": self.line, "link": link}
                    )
    

    However, for the page you mentioned, you should change link.startswith("http") to link.startswith("/url?q=").