Subversion Repositories Misc

[/] [dslmap/] [forgottenislanderbot/] [fib-crawlbot.py] - Rev 14

Compare with Previous | Blame | View Log

#!/usr/bin/env python
 
# forgottenislanderbot - makes a DSL coverage map from Bell Aliant's website.
# Copyright (c) 2010 Art Ortenburger
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
 
# fib-crawlbot.py
# by Art Ortenburger IV <utrrrongeeb a users , sf , net>
# written 2010-01-13 -
#
# forgottenislanderbot -- crawler bot
#
# Checks street addresses for DSL availability
# on Bell Aliant's website.
# Loads addresses from SQlite database,
# selects a sparse but regular subset of addresses,
# and automatically shoves them through Bell Aliant's
# website at the designated rate, parsing the pages
# and updating the SQlite database.
#
 
 
import sys
import sqlite3
import robotparser
import fibmod
import httplib
import urllib
import time
import random
 
print sys.argv[0],"""(forgottenislanderbot crawler bot)    Version 0.1
Copyright (c) 2010 Art Ortenburger. Licensed under the GNU GPL; no warranty."""
 
# User-Agent sent with every request. A browser-like alternative is kept
# below; swap the two assignments if it is ever needed.
bot_useragent = "forgottenislanderbot"
# bot_useragent = """Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)"""

# Bell Aliant server that answers the availability queries ...
ba_host = "productsandservice.bellaliant.net"
# ... and the handler on it that checks a whole address with one request.
ba_handler = "/PS/pe/english/productsandservices/qualificationFullAddressCheck.do"

# HTTP 1.1 headers set explicitly on every address lookup.
# The commented-out entries record earlier experiments: a limited
# byte-range request did not work; gzip is the next thing to try.
headers = {
    "User-Agent": bot_useragent,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
#    "Accept-Encoding": "gzip,deflate",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
#    "Range": "bytes=15335-17374",# experimental; didn't work
    "Keep-Alive": 3000,
    "Connection": "keep-alive",
    "Referer": "http://productsandservice.bellaliant.net/PS/pe/english/productsandservices/qualificationApartmentCheck.do",
#    "Cookie": "region=pe; language=en; ALIANTCON=7FmtLNjpM8NpwNfnKtt1nhnVQ30dGLyZGyZVvSjz311BvbJNbrhC!563379176!642852716; es_vis_ck=1263379648515_7410; es_session_ck=1263379648515_7588; WT_FPC=id=248ff551ebe935daaef1263379651212:lv=1263379867207:ss=1263379651212; AliantWebSurvey_Invited=Invited",
    "Content-Type": "application/x-www-form-urlencoded",
}

# Every so often the crawler takes a 'long' pause, in order to
# a) commit the database, and
# b) leave a comfortable window for a SIGINT.
safetypauseblock = 20 # number of full requests between pauses. Make larger?
safetypause = 3 # seconds to pause. Can be a float.
safetypausecount = 0 # requests made so far in the current block
 
def checkAddress(sqlcon,town,road,civic,delay,status,lon,lat):
    """ Checks a single full civic address' DSL status on the Bell Aliant website.
    sqlcon: sqlite3.Cursor for FIB database
    town:   town of address
    road:   street of address
    civic:  int; civic number for address
    delay:  float; seconds to wait after request
    status: DSL status of address. Used to skip.
    lon:    float; longitude of address
    lat:    float; latitude of address"""
 
    # print a progress line for each address
    print "checking ", civic, road, ",", town, ":",
    sys.stdout.flush() # fix for progress line on Linux
 
##    # avoid re-checking items
##    if status > 0: # reserve -1 for errors, etc
##        print status," [ skipping ]"
##        sys.stdout.flush()
##        return
 
    try:
        # POST form value params
        # warning: using urllib mixes up the original header order
        params = urllib.urlencode({"province_code_id":"pe",
                                   "product_id":"2281",
                                   "streetName":road,
                                   "product_id":"2281",
                                   "communityName":town,
                                   "curbody":"51",
                                   "bodycont":"null",
                                   "section":"51",
                                   "subsection":"0",
                                   "community_id":"null",
                                   "apartment":"",
                                   "streetNumber":civic,
                                   "streetNumberSuffix":""})
        hc.connect() # seems to be needed, over dial-up, at least
        hc.request("POST",ba_handler,params,headers)
        print ".",
        sys.stdout.flush()
        hr = hc.getresponse()
        if hr.status != httplib.OK:
            print ">",hr.status,
            pass # maybe set an error flag?
        print ".",
        sys.stdout.flush()
        hrd = hr.read()
        print ".",
        sys.stdout.flush()
       # print hrd
        if hrd.find("<b>We're Sorry!</b>") != -1: # no dsl
            sqlcon.execute("update dsl set dsl=1 where civic=? and road=? and town=?",
                           [civic,road,town])
            print ": 1;",
            sys.stdout.flush()
        elif hrd.find("Ultra") != -1: # refine this
            sqlcon.execute("update dsl set dsl=3 where civic=? and road=? and town=?",
                           [civic,road,town])
            print ": 3;",
            sys.stdout.flush()
        elif hrd.find("<td>Congratulations! You can choose") != -1: # refine this
            sqlcon.execute("update dsl set dsl=2 where civic=? and road=? and town=?",
                           [civic,road,town])
            print ": 2;",
            sys.stdout.flush()
        else: # error
            sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
                           [civic,road,town])
            print ": !! unparseable !!",
            sys.stdout.flush()
        print "ok...",
        sys.stdout.flush()
    except KeyboardInterrupt:
        print
        print "SIGINT received, exiting"
        sys.stdout.flush()
        safeexit()
    except httplib.HTTPException, httperr:
        try:
            sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
                           [civic,road,town])
        except:
            pass
        print "!! error !!:"
        print httperr
        print sys.exc_info()[0]
        print ":continuing:"
        sys.stdout.flush()
    except Exception, errdet: # IO errors, net errors, etc.
        try:
            sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
                           [civic,road,town])
        except:
            pass
        print "!! error !!:"
        print errdet
        print sys.exc_info()[0]
        print ":continuing:"
        sys.stdout.flush()
    finally:
        try:
            # sleep to reduce server load / pretend to not be a bot
            time.sleep((delay*(0.75+random.random()*0.5))) # semi-randomized time
            # if needed, kill the script with SIGINT during the sleep.
        except KeyboardInterrupt:
            safeexit()
        except: # can't have _anything_ stop crawler accidentally
            pass
    print "next"
    sys.stdout.flush()
 
    # safety pause, to save the database and allow for Ctrl-C killing:
    try:
        globals()['safetypausecount'] = safetypausecount + 1
        if safetypausecount >= safetypauseblock:
            globals()['safetypausecount'] = 0
            print "routine ",safetypause,"s delay to save the database and ease exiting:",
            sys.stdout.flush()
            qdbCon.commit()
            print "committed, sleeping...",
            sys.stdout.flush()
            time.sleep(safetypause)
            print "continuing."
            sys.stdout.flush()
    except KeyboardInterrupt:
        print "exiting:"
        safeexit()
    except Exception, errdet: # anything else
        print "!! error during pause !! :",errdet
        pass
 
 
def checkRobotsTxt():
    try:
        rfp = robotparser.RobotFileParser()
        rfp.set_url("http://"+ba_host+"/robots.txt")
        rfp.read()
        # logic below is a mess
        # Does RobotFileParser automatically handle wildcards?
        # This is sort of a token gesture, though.
        if not (rfp.can_fetch("*","http://"+ba_host+ba_handler) or
            rfp.can_fetch(bot_useragent,"http://"+ba_host+ba_handler)):
            print """
********************************************
********************************************
***                                      ***
*** A robots.txt file is denying access. ***
***                                      ***
*** Override? (y/*)                      ***"""
            ora = raw_input("*** ----->")
            if not ora == "y":
                print "****** Exiting: denied access due to robots.txt"
                exit(0)
            else:
                print "****** Continuing: robots.txt overridden"
    except KeyboardInterrupt:
        print "caught SIGINT, exiting"
        safeexit()
    except: # sloppy, I know. 
        print "Couldn't fetch robots.txt; continuing"
        # doesn't care if it doesn't load
 
 
 
def safeexit():
    """ stop crawling, hopefully saving all records
    and closing all streams."""
    try: # don't want an individual failure to prevent qdbCon.commit()
        hc.close()
    except:
        pass
    try:
        qdbC.close()
    except:
        pass
    try:
        qdbCon.commit()
        qdbCon.close()
    except:
        pass
    exit(0)
 
 
# start the bot. First confirm args are present
if len(sys.argv)<4:
    print """
Usage:  fib-crawlerbot.py (database.sqlite) (sparsity) (delay) (exclude)
 
         (sparsity)    minimum spacing between civic numbers to check
         (delay)       seconds between requests (randomized +/- 25%)
         (exclude)     whether to exclude the two cities (y/n)
 
"""
    exit(0)
 
 
qdbCon = sqlite3.connect(sys.argv[1]) # SQlite database connection
qdbC = qdbCon.cursor() # SQlite database cursor
 
checkRobotsTxt()
# start crawling:
hc = httplib.HTTPConnection(ba_host)
# use the shared module's processDB for parsing the database
# bool list is the dslflags param -- by default, only get unchecked houses
# may want to change this with new arg? and maxrows is False, for
# maximum sparseness
fibmod.processDB(qdbC,checkAddress,[True,False,False,False],False,int(sys.argv[2]),float(sys.argv[3]),fibmod.char2Bool(sys.argv[4]))
 
# done crawling, exit safely
safeexit()
 
 
 
# SQLite table format is
# [civic]   [road]    [town]  [lat]   [long]   [dsl]
# dsl options: [ -1: error / unparseable response, 0: unknown, 1: no, 2: basic, 3: ultra ]
 

Compare with Previous | Blame | View Log