Search code examples
python hbase bigdata happybase

Output separated HBase columns using happybase


I have an HBase table like this:

total date1:tCount1 date2:tCount2 ...
url1 date1:clickCount1 date2:clickCount2 ...
url2 date1:clickCount1 date2:clickCount2 ...
...

url1, url2, ... are row keys. The table has only one column family.

I have a date range (from datei to datej) as input. For each day in that range, I need to output each URL's share of that day's clicks.

The output must have the following format:

datei url1:share1 url2:share1...
...
datej url1:share1 url2:share1...

where

datei.url1:share1 = url1.datei:clickCount1 / total datei:tCount1

I started writing a happybase script, but I don't know how to select individual columns from a row using happybase. My script is below:

import argparse
import calendar
import getpass
import happybase
import logging
import random
import sys

# Usage text handed to argparse in main(); the {0} placeholders are filled
# with the script name taken from sys.argv[0].
USAGE = """

To query daily data for a year, run:
  $ {0} --action query --year 2014

To query daily data for a particular month, run:
  $ {0} --action query --year 2014 --month 10

To query daily data for a particular day, run:
  $ {0} --action query --year 2014 --month 10 --day 27

To compute totals add `--total` argument.

""".format(sys.argv[0])

# Configure the root logger at DEBUG level so the logging.debug() calls in
# connect() are emitted.
logging.basicConfig(level="DEBUG")

# Candidate HBase Thrift server hosts: bds07, bds08 and bds09
# (xrange(7, 10) stops before 10). connect() picks one at random.
HOSTS = ["bds%02d.vdi.mipt.ru" % i for i in xrange(7, 10)]
# Table name is suffixed with the current login name, so each user gets
# their own table.
TABLE = "VisitCountPy-" + getpass.getuser()

def connect():
    """Connect to a randomly chosen Thrift host and return a handle to TABLE.

    One entry of HOSTS is picked at random and a happybase connection to it
    is opened. If TABLE is not among the tables reported by the server, it is
    created with a single column family ``cf`` using default settings.

    Returns:
        A ``happybase.Table`` bound to TABLE on the opened connection.
    """
    thrift_host = random.choice(HOSTS)
    connection = happybase.Connection(thrift_host)

    logging.debug("Connecting to HBase Thrift Server on %s", thrift_host)
    connection.open()

    table_exists = TABLE in connection.tables()
    if table_exists:
        logging.debug("Using table %s", TABLE)
    else:
        # Column family `cf` with default settings.
        connection.create_table(TABLE, {"cf": {}})
        logging.debug("Created table %s", TABLE)

    return happybase.Table(TABLE, connection)

def query(args, table):
    """Print the stored value for each day in the requested range, or their sum.

    Args:
        args: Parsed command-line namespace. ``year``/``month``/``day`` select
            the date range (expanded by ``get_time_range``); ``total``
            switches from one output line per row to a single summed line.
        table: Object whose ``scan(row_start=..., row_stop=...)`` yields
            ``(row_key, columns)`` pairs, e.g. a ``happybase.Table``. Each
            row is expected to carry a ``cf:value`` column.

    Output goes to stdout as tab-separated ``key<TAB>value`` lines, or a
    single ``total<TAB>sum`` line when ``args.total`` is set.
    """
    keys = list(get_time_range(args))
    if not keys:
        # An empty range (e.g. day 0) would make min()/max() raise
        # ValueError; there is nothing to scan, so report an empty result.
        if args.total:
            print("total\t%s" % 0)
        return

    first_key = min(keys)
    # happybase's row_stop is exclusive, so passing max(keys) directly would
    # silently drop the last day. Appending a NUL byte gives the smallest
    # key that sorts strictly after the last day, which keeps it in range.
    stop_key = max(keys) + "\x00"

    # A plain int is enough: Python 2 promotes to long automatically.
    total = 0
    for key, data in table.scan(row_start=first_key, row_stop=stop_key):
        value = data["cf:value"]
        if args.total:
            total += int(value)
        else:
            # Single-argument parenthesized print is valid Python 2 and
            # prints exactly what the statement form did.
            print("%s\t%s" % (key, value))

    if args.total:
        print("total\t%s" % total)

def get_time_range(args):
    """Yield ``YYYYMMDD`` strings for every day selected by *args*.

    Args:
        args: Object with ``year``, ``month`` and ``day`` attributes.
            ``month`` set to None means all twelve months; ``day`` set to
            None means every day of each selected month.

    Yields:
        Zero-padded ``YYYYMMDD`` strings in chronological order. Day values
        that are not positive are skipped; this covers the 0 placeholders
        ``itermonthdays`` emits for days outside the month.
    """
    month_calendar = calendar.Calendar()
    year = args.year

    if args.month is None:
        selected_months = range(1, 13)
    else:
        selected_months = [args.month]

    for month in selected_months:
        if args.day is None:
            candidate_days = month_calendar.itermonthdays(year, month)
        else:
            candidate_days = [args.day]

        for day in candidate_days:
            if day <= 0:
                continue
            yield "%04d%02d%02d" % (year, month, day)

def main():
    """Parse the command line, validate the date arguments and run the query.

    Raises:
        RuntimeError: If ``--day`` is given without ``--month``, if
            ``--month`` is outside 1-12, or if ``--day`` is outside 1-31.
    """
    parser = argparse.ArgumentParser(description="An HBase example", usage=USAGE)
    # NOTE(review): "generate" is an accepted choice, but this function only
    # ever runs the query path; confirm whether a generate step is missing.
    parser.add_argument("--action", metavar="ACTION", choices=("generate", "query"), required=True)
    parser.add_argument("--year", type=int, required=True)
    parser.add_argument("--month", type=int, default=None)
    parser.add_argument("--day", type=int, default=None)
    parser.add_argument("--total", action="store_true", default=False)

    args = parser.parse_args()

    # Validate before connecting: connect() opens a network connection and
    # may create the table, which should not happen for a bad command line.
    if args.day is not None and args.month is None:
        raise RuntimeError("Please, specify a month when specifying a day.")
    if args.month is not None and not 1 <= args.month <= 12:
        raise RuntimeError("Please, specify a valid month.")
    # Day 0 must be rejected too: it produces an empty date range.
    if args.day is not None and not 1 <= args.day <= 31:
        raise RuntimeError("Please, specify a valid day.")

    table = connect()
    query(args, table)

if __name__ == "__main__":
    main()

So, how should I change my script (specifically, the query() function) to get the individual columns within the given date range?


Solution

  • I think you should use a scanner filter, which you can provide as a string (which will be interpreted at the server) through the scan(filter=...) argument.

    See https://github.com/wbolster/happybase/issues/11 for some pointers (examples, docs).