Monday, February 27, 2006

URL to DSL: Turning JobJitsu's URLs into a job query language.


One of the goals behind JobJitsu is to uphold the KISS (Keep It Simple, Stupid) philosophy and, wherever possible, the KIHF (Keep It Hacker Friendly) philosophy. That is especially true for URLs. One thing I really like about del.icio.us is its site navigation: it is pretty simple and at the same time very expressive and powerful. In that spirit, we decided to make our URLs useful for both searching and navigation.

For example:
When searching for python software engineering jobs in San Diego, one would use a URL like...
  http://jobjitsu.com/jobs/sandiego/sw/python
The structure of the URL is:
  http://jobjitsu.com/jobs/locations/categories/tags

where
  locations -> comma separated list of locations
  categories -> comma separated list of categories
  tags -> list of tags and operators
  operators -> ,+!
    , := or
    + := and
    ! := not
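
As a sanity check, the first example above unpacks like so (using the unpack routine shown at the bottom of this post):

  unpack('/jobs/sandiego/sw/python',
         ['locations', 'categories', 'tags', 'start', 'pagesize'])
  -> {'locations': 'sandiego', 'categories': 'sw', 'tags': 'python'}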

A more complex example:
Show me all the jobs in San Diego or Los Angeles that are in the software engineering or quality assurance categories having the tags python and lisp and linux but not microsoft...
  http://jobjitsu.com/jobs/sandiego,losangeles/sw,qa/+python+lisp+linux!microsoft
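
Fed through the tokenizer and classifier shown later in this post, the tags segment of that URL comes out as an (AND, OR, NOT) triple:

  tokenize('+python+lisp+linux!microsoft')
  -> ['+', 'python', '+', 'lisp', '+', 'linux', '!', 'microsoft']
  classify_qry_tokens(...)
  -> (['linux', 'lisp', 'python'], [], ['microsoft'])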

But wait, there's more...
By adding the feed subdomain to your URL you'll get an Atom feed of jobs:
  http://jobjitsu.com/jobs/sandiego/sw/python
gives you something that looks good in a browser, and
  http://feed.jobjitsu.com/jobs/sandiego/sw/python
will give you something that looks good in your feed reader. This lets you easily syndicate your job searches. So if you always want to see the latest lisp jobs, you can now embed:
  http://feed.jobjitsu.com/jobs/all/all/lisp
into your Google personalized homepage (google/ig).

Now for the fun stuff... how do we make it work?
We first built a URL library to handle most of the low-level URL parsing, which lets us build a parse tree of operators and tokens. We then created the SQL-generation code that translates this parse tree into SQL statements, and finally we wrapped the whole thing up with some query caching. Disclaimer: this is all still first-pass code and is subject to bugs and change, but I figured I would post it because it is kind of neat.

The SQL-generation code is not included because it is quite hackish (even more so than the rest of my code) and will need to be improved...
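
Still, to make the handler below easier to follow, here is a minimal sketch of the shape the sqlgen helpers might take. This is a guess, not the real module: the jb_job_tags join table and the j.id/j.location/j.category columns are assumptions (only the jb_jobs table actually appears in the code below).

# sqlgen sketch -- NOT the real module, just a guess at its shape.
# assumes a jb_job_tags join table and location/category columns
# on jb_jobs; the real schema is not shown in this post.

def _quoted(vals):
    return ", ".join(["'%s'" % v for v in vals])

def job_loc_or(locations):
    # match any of the listed locations
    return "j.location in (%s)" % _quoted(locations)

def job_cat_or(categories):
    # match any of the listed categories
    return "j.category in (%s)" % _quoted(categories)

def job_tags_and(tags):
    # one EXISTS per tag: the job must carry every tag
    return " and ".join([
        ("exists (select 1 from jb_job_tags t "
         "where t.job_id = j.id and t.tag = '%s')") % tag
        for tag in tags])

def job_tags_or(tags):
    # the job must carry at least one of the tags
    return ("exists (select 1 from jb_job_tags t "
            "where t.job_id = j.id and t.tag in (%s))") % _quoted(tags)

def job_tags_not(tags):
    # the job must carry none of the tags
    return ("not exists (select 1 from jb_job_tags t "
            "where t.job_id = j.id and t.tag in (%s))") % _quoted(tags)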

Job handler...


from ats.cache import memcache_mgr
from ats.db import dao
from ats.log import logger
from ats.util import converter
from ats.util import url as url_lib
from common.model import job as job_model
from common.model import sqlgen
from common.ui import navigation
from home.view.jobs import jobs as jobtmpl
from home.view.sidebar import sidebar
from mod_python import apache

# ------------------------------------------------------------------
# globals
# ------------------------------------------------------------------
cache = memcache_mgr.get_cache()
log = logger.get_logger()

# ------------------------------------------------------------------
# handler
# ------------------------------------------------------------------
def handler(req, sesh):
    """ This function will return all the jobs based on the query string
    """
    global log
    jobs, start, pagesize, error = [], 0, 0, 0
    try:
        jobs, start, pagesize = get_jobs(req.uri)
        jobs = sort_jobs(jobs)
        if jobs and start > 0:
            # slice out the requested page (start is 1-based)
            jobs = jobs[start-1:start-1+pagesize]
    except Exception, e:
        log.exception(e)
        error = 1

    if error:
        # do some error handling here
        pass

    return jobtmpl(searchList=[{
        "site_nav": navigation.get_site_nav('jobs'),
        "sesh": sesh,
        "jobs": jobs,
        "start": start,
        "pagesize": pagesize,
        "sidebar": sidebar(
            searchList=[{"sesh": sesh, "metros": job_model.get_metros()}])}])

# ------------------------------------------------------------------
# get jobs from cache or db
# ------------------------------------------------------------------
def get_jobs(url):
    """ Check the cache to see if this query has been run recently,
    if so then return those results.
    On cache miss we construct the query and go to the database.
    """
    global cache
    if not url: return [], 0, 0
    urlkey = 'jj:jobs:%s' % url
    segments = url_lib.unpack(
        url,
        ['locations', 'categories', 'tags', 'start', 'pagesize'],
        keep_blank_values=0)
    start = converter.toint(segments.get('start', 1))
    pagesize = converter.toint(segments.get('pagesize', 50))

    # check cache first
    jobs = cache.get(urlkey)
    if jobs:
        return jobs, start, pagesize

    # on cache miss parse query string and run query
    locations = filter(  # split on ',' + filter empties and 'all'
        lambda x: x and x != 'all',
        segments.get('locations', '').lower().split(','))
    categories = filter(  # split on ',' + filter empties and 'all'
        lambda x: x and x != 'all',
        segments.get('categories', '').lower().split(','))
    tags = url_lib.process(segments.get('tags', 'all').lower())

    # run query
    ct, jobs = dao.get_records(
        'jobjitsu',
        construct_sql(locations, categories, tags))

    # cache for 5 min
    if ct > 0:
        cache.set(urlkey, jobs, 5)
    else:
        jobs, start, pagesize = [], 0, 0

    return jobs, start, pagesize
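
# example walk-through of get_jobs (values illustrative, not real data):
#   get_jobs('/jobs/sandiego/sw/python')
#     cache key  -> 'jj:jobs:/jobs/sandiego/sw/python'
#     locations  -> ['sandiego']
#     categories -> ['sw']
#     tags       -> ([], ['python'], [])   # (AND, OR, NOT) via url_lib.process
#   on a cache hit we return (jobs, 1, 50) without touching the db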

# ------------------------------------------------------------------
# translate query string to job query
# ------------------------------------------------------------------
def construct_sql(locations, categories, tags):
    """ build sql stmt from sections of query string of the form:
    /jobs/locations/categories/tags/start/pagesize
    returns the sql statement string
    """
    # construct the query clause by clause
    sql = []
    if locations:
        sql.append(sqlgen.job_loc_or(locations))
    if categories:
        sql.append(sqlgen.job_cat_or(categories))
    if tags and tags[0]:
        sql.append(sqlgen.job_tags_and(tags[0]))
    if tags and tags[1]:
        sql.append(sqlgen.job_tags_or(tags[1]))
    if tags and tags[2]:
        sql.append(sqlgen.job_tags_not(tags[2]))
    if not sql:  # e.g. /jobs/all/all/all -- no filters at all
        sql = ["1=1"]
    stmt = "%s%s;" % (" select j.* from jb_jobs j where ", " \nand ".join(sql))
    return stmt
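
# shape of the generated statement for the complex example above
# (the clause bodies come from sqlgen, which isn't shown in this post):
#   select j.* from jb_jobs j where <locations OR clause>
#   and <categories OR clause>
#   and <tags AND clause>
#   and <tags NOT clause>;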

# ------------------------------------------------------------------
# utility
# ------------------------------------------------------------------
def sort_jobs(jobs):
    """ newest jobs first """
    if not jobs: return []
    ordered = jobs[:]  # don't mutate the caller's list
    ordered.sort(lambda x, y: cmp(y.createdt, x.createdt))
    return ordered




Low Level URI handling...


from ats.db import dao
from ats.log import logger

log = logger.get_logger()

escape_char = "'"
op_and = '+'
op_or = ','
op_not = '!'
operators = (op_and, op_or, op_not)

def unpack(uri, keys, keep_blank_values=0):
    """ This function will take a req.uri and a list and return a dict
    of key->val pairs where the keys are taken from the list and
    values are taken from the req.uri.split('/')
    """
    d = {}
    if keep_blank_values:
        for key in keys:
            d[key] = None
    if not uri or not keys:
        return d
    try:
        num_keys = len(keys)
        tokens = uri.split('/')[2:]
        for i in xrange(len(tokens)):
            if i < num_keys and tokens[i]:
                d[keys[i]] = tokens[i]
        return d
    except:
        return d

def process(expr):
    """ turn a raw tags expression into an (AND, OR, NOT) triple """
    global log
    if not expr or expr == 'all': return None
    try:
        return simplify_qry_expr(classify_qry_tokens(tokenize(expr)))
    except Exception, e:
        log.exception(e)
        return None

def tokenize(expr):
    """ takes a string like 'red,green+blue' and returns a list
    in [OPER1, TOKEN1, OPER2, TOKEN2, ...] format e.g.:
    [',', 'red', ',', 'green', '+', 'blue']
    this list can then be later operated on and transformed into
    a sql query by examining the operators and tokens
    """
    global operators, op_or, escape_char
    tokens = []
    word = []
    i = 0
    while i < len(expr):
        #########################################################
        ###  save this section for when we impl group'd exprs ###
        #########################################################
        # if expr[i] == '(':
        #     jmp = match_brace(expr[i:])
        #     tokens.append(tokenize(expr[i+1:i+jmp]))
        #     i += jmp+1
        #     continue
        # if expr[i] == ')':
        #     if word:
        #         tokens.append("".join(word))
        #     if tokens and tokens[0] not in operators:
        #         return [op_or] + tokens
        #     return tokens
        #########################################################
        if expr[i] == escape_char:
            i += 1  # skip the escape char, keep the char it protects
            word.append(expr[i])
            i += 1
            continue
        elif expr[i] in operators:
            if word:
                tokens.append(dao.escape("".join(word)))
                word = []
            tokens.append(expr[i])
        else:
            word.append(expr[i])
        i += 1
    if word:
        tokens.append(dao.escape("".join(word)))
    if tokens and tokens[0] not in operators:
        # a bare leading word gets an implicit OR
        return [op_or] + tokens
    return tokens
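
# examples:
#   tokenize('red,green+blue') -> [',', 'red', ',', 'green', '+', 'blue']
#   tokenize('python')         -> [',', 'python']   # implicit leading OR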

def match_brace(expr):
    """ Given a string '(hello (not) cool) world' will return 17
    useful for (grouped expressions) -- coming soon
    """
    brc_count = 1
    length = len(expr)
    for i in xrange(length-1):
        if expr[i+1] == '(':
            brc_count += 1
        elif expr[i+1] == ')':
            brc_count -= 1
        if brc_count == 0:
            return i + 1
    if expr[length-1] == ')':
        return length-1
    return 0

def classify_qry_tokens(tokens):
    """ Takes a flat list of the form op, word, op, word, ...
    returns a tuple of lists representing AND-tokens, OR-tokens, NOT-tokens
    """
    global operators, op_and, op_or, op_not
    _and, _or, _not = [], [], []
    copied = tokens[:]
    while copied:
        token = copied.pop()
        oper = copied.pop()
        if isinstance(token, list):  # a (grouped expression) -- coming soon
            token = classify_qry_tokens(token)
        if oper == op_and: _and.append(token)
        elif oper == op_or: _or.append(token)
        elif oper == op_not: _not.append(token)
    return _and, _or, _not

def simplify_qry_expr(expr):
    """ OR is lowest level operator
    it only matters if AND and NOT aren't present.
    * AND + OR reduces to just AND..
    * NOT + OR reduces to just NOT...
    """
    if expr[0] or expr[2]:
        _or = []
    else:
        _or = expr[1]
    return expr[0], _or, expr[2]
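
# end-to-end example:
#   process('red,green+blue')
#     tokenize -> [',', 'red', ',', 'green', '+', 'blue']
#     classify -> (['blue'], ['green', 'red'], [])
#     simplify -> (['blue'], [], [])   # AND present, so the ORs drop out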

if __name__ == "__main__":
    print "keep_blank_values=0"
    print """/cool/666/42 - ['evilid', 'goodid']"""
    d = unpack("/cool/666/42", ['evilid', 'goodid'])
    for k, v in d.items(): print k, v

    print """/cool/666/42 - ['evilid']"""
    d = unpack("/cool/666/42", ['evilid'])
    for k, v in d.items(): print k, v

    print """/cool/666 - ['evilid', 'goodid']"""
    d = unpack("/cool/666", ['evilid', 'goodid'])
    for k, v in d.items(): print k, v

    print "\nkeep_blank_values=1"
    print """/cool/666/42 - ['evilid', 'goodid']"""
    d = unpack("/cool/666/42", ['evilid', 'goodid'], 1)
    for k, v in d.items(): print k, v

    print """/cool/666/42 - ['evilid']"""
    d = unpack("/cool/666/42", ['evilid'], 1)
    for k, v in d.items(): print k, v

    print """/cool/666 - ['evilid', 'goodid']"""
    d = unpack("/cool/666", ['evilid', 'goodid'], 1)
    for k, v in d.items(): print k, v
