# Compare with Previous | Blame | View Log
#!/usr/bin/env python
# forgottenislanderbot - makes a DSL coverage map from Bell Aliant's website.
# Copyright (c) 2010 Art Ortenburger
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# fib-crawlbot.py
# by Art Ortenburger IV <utrrrongeeb a users , sf , net>
# written 2010-01-13 -
#
# forgottenislanderbot -- crawler bot
#
# Checks street addresses for DSL availability
# on Bell Aliant's website.
# Loads addresses from SQlite database,
# selects a sparse but regular subset of addresses,
# and automatically shoves them through Bell Aliant's
# website at the designated rate, parsing the pages
# and updating the SQlite database.
#
# Requires Python 2.6 or later in the 2.x series (httplib / robotparser).
#

from __future__ import print_function

import sys
import sqlite3
import robotparser
import httplib
import urllib
import time
import random

import fibmod

print(sys.argv[0], """(forgottenislanderbot crawler bot) Version 0.1
Copyright (c) 2010 Art Ortenburger.
Licensed under the GNU GPL; no warranty.""")

# default bot useragent. If necessary, uncomment alternative below.
bot_useragent = "forgottenislanderbot"
# bot_useragent = """Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)"""

# hostname of Bell Aliant server
ba_host = "productsandservice.bellaliant.net"

# file which so conveniently does a whole address check with one request
ba_handler = "/PS/pe/english/productsandservices/qualificationFullAddressCheck.do"

# HTTP 1.1 headers to custom-set when looking up addresses
# tried a limited-byterange request, didn't work. Next try gzip.
headers = {
    "User-Agent": bot_useragent,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
    # "Accept-Encoding": "gzip,deflate",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
    # "Range": "bytes=15335-17374",# experimental; didn't work
    "Keep-Alive": 3000,
    "Connection": "keep-alive",
    "Referer": "http://productsandservice.bellaliant.net/PS/pe/english/productsandservices/qualificationApartmentCheck.do",
    # "Cookie": "region=pe; language=en; ALIANTCON=7FmtLNjpM8NpwNfnKtt1nhnVQ30dGLyZGyZVvSjz311BvbJNbrhC!563379176!642852716; es_vis_ck=1263379648515_7410; es_session_ck=1263379648515_7588; WT_FPC=id=248ff551ebe935daaef1263379651212:lv=1263379867207:ss=1263379651212; AliantWebSurvey_Invited=Invited",
    "Content-Type": "application/x-www-form-urlencoded",
}

# these are to make a 'long' delay every now and then, to
# a) commit the database, and
# b) allow more time for a SIGINT.
safetypausecount = 0   # current index in a block
safetypauseblock = 20  # number of full requests between pauses. Make larger?
safetypause = 3        # seconds to pause. Can be a float.

# values written to the 'dsl' column (see the format note at end of file);
# -1 is reserved for errors and unparseable pages.
DSL_ERROR = -1
DSL_NONE = 1
DSL_BASIC = 2
DSL_ULTRA = 3


def _progress(*parts):
    """Print parts on the current progress line (no newline) and flush."""
    print(*parts, end=" ")
    sys.stdout.flush()  # fix for progress line on Linux


def _classify_page(page):
    """Map the text of a result page to one of the DSL_* status values.

    The markers are tested in this order: the "We're Sorry!" banner, then
    any mention of "Ultra", then the "Congratulations!" cell.  A page that
    matches none of them is reported as DSL_ERROR.
    """
    if "<b>We're Sorry!</b>" in page:  # no dsl
        return DSL_NONE
    if "Ultra" in page:  # refine this
        return DSL_ULTRA
    if "<td>Congratulations! You can choose" in page:  # refine this
        return DSL_BASIC
    return DSL_ERROR


def _store_status(sqlcon, civic, road, town, dsl):
    """Write the dsl status of one address through the given cursor."""
    sqlcon.execute(
        "update dsl set dsl=? where civic=? and road=? and town=?",
        [dsl, civic, road, town])


def _sleep_or_exit(seconds):
    """Sleep for the given time; Ctrl-C during the sleep exits via safeexit()."""
    try:
        time.sleep(seconds)
    except KeyboardInterrupt:
        safeexit()
    except Exception:  # can't have _anything_ stop crawler accidentally
        pass


def checkAddress(sqlcon,town,road,civic,delay,status,lon,lat):
    """ Checks a single full civic address' DSL status on the
    Bell Aliant website and records the result in the database.

    sqlcon: sqlite3.Cursor for FIB database
    town: town of address
    road: street of address
    civic: int; civic number for address
    delay: float; seconds to wait after request (randomized +/- 25%)
    status: DSL status of address. Was used to skip already-checked
            addresses; that check is currently commented out.
    lon: float; longitude of address (unused here)
    lat: float; latitude of address (unused here)

    Uses the module-level HTTP connection 'hc' and database connection
    'qdbCon'.  Request and parse failures are recorded as dsl=-1 and the
    crawl continues; Ctrl-C exits the program through safeexit().
    """
    global safetypausecount

    # print a progress line for each address
    _progress("checking ", civic, road, ",", town, ":")
##    # avoid re-checking items
##    if status > 0: # reserve -1 for errors, etc
##        print status," [ skipping ]"
##        sys.stdout.flush()
##        return
    try:
        # POST form value params
        # warning: using urllib mixes up the original header order
        params = urllib.urlencode({
            "province_code_id": "pe",
            "product_id": "2281",
            "streetName": road,
            "communityName": town,
            "curbody": "51",
            "bodycont": "null",
            "section": "51",
            "subsection": "0",
            "community_id": "null",
            "apartment": "",
            "streetNumber": civic,
            "streetNumberSuffix": "",
        })
        hc.connect()  # seems to be needed, over dial-up, at least
        hc.request("POST", ba_handler, params, headers)
        _progress(".")
        hr = hc.getresponse()
        if hr.status != httplib.OK:
            # only reported; the body is still parsed below.
            # maybe set an error flag?
            _progress(">", hr.status)
        _progress(".")
        hrd = hr.read()
        _progress(".")
        # print hrd
        dsl = _classify_page(hrd)
        _store_status(sqlcon, civic, road, town, dsl)
        if dsl == DSL_ERROR:
            _progress(": !! unparseable !!")
        else:
            _progress(": %d;" % dsl)
        _progress("ok...")
    except KeyboardInterrupt:
        print()
        print("SIGINT received, exiting")
        sys.stdout.flush()
        safeexit()
    except Exception as errdet:  # HTTP errors, IO errors, net errors, etc.
        try:
            _store_status(sqlcon, civic, road, town, DSL_ERROR)
        except Exception:
            pass  # best effort: the failure is still reported below
        print("!! error !!:")
        print(errdet)
        print(type(errdet))
        print(":continuing:")
        sys.stdout.flush()

    # sleep to reduce server load / pretend to not be a bot.
    # if needed, kill the script with SIGINT during the sleep.
    # Deliberately not in a 'finally': once safeexit() has been called
    # there is no point in sleeping before the process exits.
    _sleep_or_exit(delay * (0.75 + random.random() * 0.5))  # semi-randomized time
    print("next")
    sys.stdout.flush()

    # safety pause, to save the database and allow for Ctrl-C killing:
    try:
        safetypausecount += 1
        if safetypausecount >= safetypauseblock:
            safetypausecount = 0
            _progress("routine ", safetypause,
                      "s delay to save the database and ease exiting:")
            qdbCon.commit()
            _progress("committed, sleeping...")
            time.sleep(safetypause)
            print("continuing.")
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("exiting:")
        safeexit()
    except Exception as errdet:  # anything else
        print("!! error during pause !! :", errdet)


def checkRobotsTxt():
    """ Consult the server's robots.txt before crawling.

    If robots.txt denies the bot access to the address-check handler, the
    user is asked whether to override; any answer other than "y" (or end
    of input) exits the program.  If robots.txt cannot be fetched or
    parsed, a message is printed and crawling continues.
    """
    target = "http://" + ba_host + ba_handler
    try:
        rfp = robotparser.RobotFileParser()
        rfp.set_url("http://" + ba_host + "/robots.txt")
        rfp.read()
        # can_fetch() falls back to the '*' entry when there is no entry
        # for this user agent, so one call covers both cases; a rule that
        # names the bot must not be overridden by the wildcard.
        allowed = rfp.can_fetch(bot_useragent, target)
    except KeyboardInterrupt:
        print("caught SIGINT, exiting")
        safeexit()
    except Exception:  # doesn't care if it doesn't load
        print("Couldn't fetch robots.txt; continuing")
        return

    if allowed:
        return

    print("""
********************************************
********************************************
***                                      ***
*** A robots.txt file is denying access. ***
***                                      ***
*** Override? (y/*)                      ***""")
    try:
        ora = raw_input("*** ----->")
    except KeyboardInterrupt:
        print("caught SIGINT, exiting")
        safeexit()
    except EOFError:
        ora = ""  # no answer counts as "do not override"
    if ora != "y":
        print("****** Exiting: denied access due to robots.txt")
        # must stay outside any broad except so the exit is not swallowed
        sys.exit(0)
    print("****** Continuing: robots.txt overridden")


def safeexit():
    """ stop crawling, hopefully saving all records and
    closing all streams.

    Every step is attempted independently so that one failure (including
    a global that has not been created yet) does not prevent the others.
    Always ends by raising SystemExit with status 0.
    """
    try:
        qdbCon.commit()
    except Exception:
        pass
    try:
        hc.close()
    except Exception:
        pass
    try:
        qdbC.close()
    except Exception:
        pass
    try:
        qdbCon.commit()
        qdbCon.close()
    except Exception:
        pass
    sys.exit(0)


# start the bot. First confirm args are present
# (script name plus four arguments; sys.argv[4] is read below)
if len(sys.argv) < 5:
    print("""
Usage: fib-crawlerbot.py (database.sqlite) (sparsity) (delay) (exclude)
    (sparsity)  minimum spacing between civic numbers to check
    (delay)     seconds between requests (randomized +/- 25%)
    (exclude)   whether to exclude the two cities (y/n)
""")
    sys.exit(0)

qdbCon = sqlite3.connect(sys.argv[1])  # SQlite database connection
qdbC = qdbCon.cursor()                 # SQlite database cursor

checkRobotsTxt()

# start crawling:
hc = httplib.HTTPConnection(ba_host)

# use the shared module's processDB for parsing the database
# bool list is the dslflags param -- by default, only get unchecked houses
# may want to change this with new arg? and maxrows is False, for
# maximum sparseness
fibmod.processDB(qdbC, checkAddress, [True, False, False, False], False,
                 int(sys.argv[2]), float(sys.argv[3]),
                 fibmod.char2Bool(sys.argv[4]))

# done crawling, exit safely
safeexit()

# SQlite format is
# [civic] [road] [town] [lat] [long] [dsl]
# dsl options: [ 0: unknown, 1: no, 2: basic, 3: ultra ]