| 1 |
14 |
art |
#!/usr/bin/env python
|
| 2 |
|
|
|
| 3 |
|
|
# forgottenislanderbot - makes a DSL coverage map from Bell Aliant's website.
|
| 4 |
|
|
# Copyright (c) 2010 Art Ortenburger
|
| 5 |
|
|
#
|
| 6 |
|
|
# This program is free software; you can redistribute it and/or
|
| 7 |
|
|
# modify it under the terms of the GNU General Public License
|
| 8 |
|
|
# as published by the Free Software Foundation; either version 2
|
| 9 |
|
|
# of the License, or any later version.
|
| 10 |
|
|
#
|
| 11 |
|
|
# This program is distributed in the hope that it will be useful,
|
| 12 |
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 14 |
|
|
# GNU General Public License for more details.
|
| 15 |
|
|
#
|
| 16 |
|
|
# You should have received a copy of the GNU General Public License
|
| 17 |
|
|
# along with this program; if not, write to the Free Software
|
| 18 |
|
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
| 19 |
|
|
#
|
| 20 |
|
|
|
| 21 |
|
|
# fib-crawlbot.py
|
| 22 |
|
|
# by Art Ortenburger IV <utrrrongeeb a users , sf , net>
|
| 23 |
|
|
# written 2010-01-13 -
|
| 24 |
|
|
#
|
| 25 |
|
|
# forgottenislanderbot -- crawler bot
|
| 26 |
|
|
#
|
| 27 |
|
|
# Checks street addresses for DSL availability
|
| 28 |
|
|
# on Bell Aliant's website.
|
| 29 |
|
|
# Loads addresses from an SQLite database,
|
| 30 |
|
|
# selects a sparse but regular subset of addresses,
|
| 31 |
|
|
# and automatically shoves them through Bell Aliant's
|
| 32 |
|
|
# website at the designated rate, parsing the pages
|
| 33 |
|
|
# and updating the SQLite database.
|
| 34 |
|
|
#
|
| 35 |
|
|
|
| 36 |
|
|
|
| 37 |
|
|
import sys
|
| 38 |
|
|
import sqlite3
|
| 39 |
|
|
import robotparser
|
| 40 |
|
|
import fibmod
|
| 41 |
|
|
import httplib
|
| 42 |
|
|
import urllib
|
| 43 |
|
|
import time
|
| 44 |
|
|
import random
|
| 45 |
|
|
|
| 46 |
|
|
print sys.argv[0],"""(forgottenislanderbot crawler bot) Version 0.1
|
| 47 |
|
|
Copyright (c) 2010 Art Ortenburger. Licensed under the GNU GPL; no warranty."""
|
| 48 |
|
|
|
| 49 |
|
|
# User-Agent sent with every request.  If the default is refused,
# uncomment the browser-like alternative underneath.
bot_useragent = "forgottenislanderbot"
# bot_useragent = """Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)"""

# Bell Aliant server that answers the availability queries.
ba_host = "productsandservice.bellaliant.net"
# Handler that conveniently checks a whole address in a single request.
ba_handler = "/PS/pe/english/productsandservices/qualificationFullAddressCheck.do"

# HTTP/1.1 headers sent with each address lookup.
# A limited byte-range request was tried and did not work; gzip is the
# next candidate (both are left here, disabled).
headers = {
    "User-Agent": bot_useragent,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us,en;q=0.5",
    # "Accept-Encoding": "gzip,deflate",
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
    # "Range": "bytes=15335-17374",# experimental; didn't work
    "Keep-Alive": 3000,
    "Connection": "keep-alive",
    "Referer": "http://productsandservice.bellaliant.net/PS/pe/english/productsandservices/qualificationApartmentCheck.do",
    # "Cookie": "region=pe; language=en; ALIANTCON=7FmtLNjpM8NpwNfnKtt1nhnVQ30dGLyZGyZVvSjz311BvbJNbrhC!563379176!642852716; es_vis_ck=1263379648515_7410; es_session_ck=1263379648515_7588; WT_FPC=id=248ff551ebe935daaef1263379651212:lv=1263379867207:ss=1263379651212; AliantWebSurvey_Invited=Invited",
    "Content-Type": "application/x-www-form-urlencoded",
}

# Periodic 'long' pause.  Every so often the crawler stops briefly to
#   a) commit the database, and
#   b) give the operator a comfortable window for a SIGINT.
safetypause = 3        # length of the pause in seconds; may be a float
safetypauseblock = 20  # full requests between two pauses. Make larger?
safetypausecount = 0   # requests made so far in the current block
|
| 78 |
|
|
|
| 79 |
|
|
def checkAddress(sqlcon,town,road,civic,delay,status,lon,lat):
|
| 80 |
|
|
""" Checks a single full civic address' DSL status on the Bell Aliant website.
|
| 81 |
|
|
sqlcon: sqlite3.Cursor for FIB database
|
| 82 |
|
|
town: town of address
|
| 83 |
|
|
road: street of address
|
| 84 |
|
|
civic: int; civic number for address
|
| 85 |
|
|
delay: float; seconds to wait after request
|
| 86 |
|
|
status: DSL status of address. Used to skip.
|
| 87 |
|
|
lon: float; longitude of address
|
| 88 |
|
|
lat: float; latitude of address"""
|
| 89 |
|
|
|
| 90 |
|
|
# print a progress line for each address
|
| 91 |
|
|
print "checking ", civic, road, ",", town, ":",
|
| 92 |
|
|
sys.stdout.flush() # fix for progress line on Linux
|
| 93 |
|
|
|
| 94 |
|
|
## # avoid re-checking items
|
| 95 |
|
|
## if status > 0: # reserve -1 for errors, etc
|
| 96 |
|
|
## print status," [ skipping ]"
|
| 97 |
|
|
## sys.stdout.flush()
|
| 98 |
|
|
## return
|
| 99 |
|
|
|
| 100 |
|
|
try:
|
| 101 |
|
|
# POST form value params
|
| 102 |
|
|
# warning: using urllib mixes up the original header order
|
| 103 |
|
|
params = urllib.urlencode({"province_code_id":"pe",
|
| 104 |
|
|
"product_id":"2281",
|
| 105 |
|
|
"streetName":road,
|
| 106 |
|
|
"product_id":"2281",
|
| 107 |
|
|
"communityName":town,
|
| 108 |
|
|
"curbody":"51",
|
| 109 |
|
|
"bodycont":"null",
|
| 110 |
|
|
"section":"51",
|
| 111 |
|
|
"subsection":"0",
|
| 112 |
|
|
"community_id":"null",
|
| 113 |
|
|
"apartment":"",
|
| 114 |
|
|
"streetNumber":civic,
|
| 115 |
|
|
"streetNumberSuffix":""})
|
| 116 |
|
|
hc.connect() # seems to be needed, over dial-up, at least
|
| 117 |
|
|
hc.request("POST",ba_handler,params,headers)
|
| 118 |
|
|
print ".",
|
| 119 |
|
|
sys.stdout.flush()
|
| 120 |
|
|
hr = hc.getresponse()
|
| 121 |
|
|
if hr.status != httplib.OK:
|
| 122 |
|
|
print ">",hr.status,
|
| 123 |
|
|
pass # maybe set an error flag?
|
| 124 |
|
|
print ".",
|
| 125 |
|
|
sys.stdout.flush()
|
| 126 |
|
|
hrd = hr.read()
|
| 127 |
|
|
print ".",
|
| 128 |
|
|
sys.stdout.flush()
|
| 129 |
|
|
# print hrd
|
| 130 |
|
|
if hrd.find("<b>We're Sorry!</b>") != -1: # no dsl
|
| 131 |
|
|
sqlcon.execute("update dsl set dsl=1 where civic=? and road=? and town=?",
|
| 132 |
|
|
[civic,road,town])
|
| 133 |
|
|
print ": 1;",
|
| 134 |
|
|
sys.stdout.flush()
|
| 135 |
|
|
elif hrd.find("Ultra") != -1: # refine this
|
| 136 |
|
|
sqlcon.execute("update dsl set dsl=3 where civic=? and road=? and town=?",
|
| 137 |
|
|
[civic,road,town])
|
| 138 |
|
|
print ": 3;",
|
| 139 |
|
|
sys.stdout.flush()
|
| 140 |
|
|
elif hrd.find("<td>Congratulations! You can choose") != -1: # refine this
|
| 141 |
|
|
sqlcon.execute("update dsl set dsl=2 where civic=? and road=? and town=?",
|
| 142 |
|
|
[civic,road,town])
|
| 143 |
|
|
print ": 2;",
|
| 144 |
|
|
sys.stdout.flush()
|
| 145 |
|
|
else: # error
|
| 146 |
|
|
sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
|
| 147 |
|
|
[civic,road,town])
|
| 148 |
|
|
print ": !! unparseable !!",
|
| 149 |
|
|
sys.stdout.flush()
|
| 150 |
|
|
print "ok...",
|
| 151 |
|
|
sys.stdout.flush()
|
| 152 |
|
|
except KeyboardInterrupt:
|
| 153 |
|
|
print
|
| 154 |
|
|
print "SIGINT received, exiting"
|
| 155 |
|
|
sys.stdout.flush()
|
| 156 |
|
|
safeexit()
|
| 157 |
|
|
except httplib.HTTPException, httperr:
|
| 158 |
|
|
try:
|
| 159 |
|
|
sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
|
| 160 |
|
|
[civic,road,town])
|
| 161 |
|
|
except:
|
| 162 |
|
|
pass
|
| 163 |
|
|
print "!! error !!:"
|
| 164 |
|
|
print httperr
|
| 165 |
|
|
print sys.exc_info()[0]
|
| 166 |
|
|
print ":continuing:"
|
| 167 |
|
|
sys.stdout.flush()
|
| 168 |
|
|
except Exception, errdet: # IO errors, net errors, etc.
|
| 169 |
|
|
try:
|
| 170 |
|
|
sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
|
| 171 |
|
|
[civic,road,town])
|
| 172 |
|
|
except:
|
| 173 |
|
|
pass
|
| 174 |
|
|
print "!! error !!:"
|
| 175 |
|
|
print errdet
|
| 176 |
|
|
print sys.exc_info()[0]
|
| 177 |
|
|
print ":continuing:"
|
| 178 |
|
|
sys.stdout.flush()
|
| 179 |
|
|
finally:
|
| 180 |
|
|
try:
|
| 181 |
|
|
# sleep to reduce server load / pretend to not be a bot
|
| 182 |
|
|
time.sleep((delay*(0.75+random.random()*0.5))) # semi-randomized time
|
| 183 |
|
|
# if needed, kill the script with SIGINT during the sleep.
|
| 184 |
|
|
except KeyboardInterrupt:
|
| 185 |
|
|
safeexit()
|
| 186 |
|
|
except: # can't have _anything_ stop crawler accidentally
|
| 187 |
|
|
pass
|
| 188 |
|
|
print "next"
|
| 189 |
|
|
sys.stdout.flush()
|
| 190 |
|
|
|
| 191 |
|
|
# safety pause, to save the database and allow for Ctrl-C killing:
|
| 192 |
|
|
try:
|
| 193 |
|
|
globals()['safetypausecount'] = safetypausecount + 1
|
| 194 |
|
|
if safetypausecount >= safetypauseblock:
|
| 195 |
|
|
globals()['safetypausecount'] = 0
|
| 196 |
|
|
print "routine ",safetypause,"s delay to save the database and ease exiting:",
|
| 197 |
|
|
sys.stdout.flush()
|
| 198 |
|
|
qdbCon.commit()
|
| 199 |
|
|
print "committed, sleeping...",
|
| 200 |
|
|
sys.stdout.flush()
|
| 201 |
|
|
time.sleep(safetypause)
|
| 202 |
|
|
print "continuing."
|
| 203 |
|
|
sys.stdout.flush()
|
| 204 |
|
|
except KeyboardInterrupt:
|
| 205 |
|
|
print "exiting:"
|
| 206 |
|
|
safeexit()
|
| 207 |
|
|
except Exception, errdet: # anything else
|
| 208 |
|
|
print "!! error during pause !! :",errdet
|
| 209 |
|
|
pass
|
| 210 |
|
|
|
| 211 |
|
|
|
| 212 |
|
|
def checkRobotsTxt():
|
| 213 |
|
|
try:
|
| 214 |
|
|
rfp = robotparser.RobotFileParser()
|
| 215 |
|
|
rfp.set_url("http://"+ba_host+"/robots.txt")
|
| 216 |
|
|
rfp.read()
|
| 217 |
|
|
# logic below is a mess
|
| 218 |
|
|
# Does RobotFileParser automatically handle wildcards?
|
| 219 |
|
|
# This is sort of a token gesture, though.
|
| 220 |
|
|
if not (rfp.can_fetch("*","http://"+ba_host+ba_handler) or
|
| 221 |
|
|
rfp.can_fetch(bot_useragent,"http://"+ba_host+ba_handler)):
|
| 222 |
|
|
print """
|
| 223 |
|
|
********************************************
|
| 224 |
|
|
********************************************
|
| 225 |
|
|
*** ***
|
| 226 |
|
|
*** A robots.txt file is denying access. ***
|
| 227 |
|
|
*** ***
|
| 228 |
|
|
*** Override? (y/*) ***"""
|
| 229 |
|
|
ora = raw_input("*** ----->")
|
| 230 |
|
|
if not ora == "y":
|
| 231 |
|
|
print "****** Exiting: denied access due to robots.txt"
|
| 232 |
|
|
exit(0)
|
| 233 |
|
|
else:
|
| 234 |
|
|
print "****** Continuing: robots.txt overridden"
|
| 235 |
|
|
except KeyboardInterrupt:
|
| 236 |
|
|
print "caught SIGINT, exiting"
|
| 237 |
|
|
safeexit()
|
| 238 |
|
|
except: # sloppy, I know.
|
| 239 |
|
|
print "Couldn't fetch robots.txt; continuing"
|
| 240 |
|
|
# doesn't care if it doesn't load
|
| 241 |
|
|
|
| 242 |
|
|
|
| 243 |
|
|
|
| 244 |
|
|
def safeexit():
    """ stop crawling, hopefully saving all records
    and closing all streams.

    Closes the HTTP connection (hc) and the database cursor (qdbC),
    commits and closes the database connection (qdbCon), then ends the
    program with status 0.  Each step is attempted on its own, so a
    failure in one of them (or a global that does not exist yet because
    the function is called early) cannot prevent the others -- the
    commit in particular.

    Never returns: always raises SystemExit(0).
    """
    # The lambdas postpone the name lookups, so that a missing global
    # is caught like any other failure of that step.
    steps = (lambda: hc.close(),
             lambda: qdbC.close(),
             lambda: qdbCon.commit(),
             lambda: qdbCon.close())
    for step in steps:
        try:
            step()
        except: # deliberately broad: not even a Ctrl-C may skip the commit
            pass
    # sys.exit rather than the exit() helper, which the site module only
    # provides for interactive use
    sys.exit(0)
|
| 261 |
|
|
|
| 262 |
|
|
|
| 263 |
|
|
# start the bot. First confirm args are present
|
| 264 |
|
|
if len(sys.argv)<4:
|
| 265 |
|
|
print """
|
| 266 |
|
|
Usage: fib-crawlerbot.py (database.sqlite) (sparsity) (delay) (exclude)
|
| 267 |
|
|
|
| 268 |
|
|
(sparsity) minimum spacing between civic numbers to check
|
| 269 |
|
|
(delay) seconds between requests (randomized +/- 25%)
|
| 270 |
|
|
(exclude) whether to exclude the two cities (y/n)
|
| 271 |
|
|
|
| 272 |
|
|
"""
|
| 273 |
|
|
exit(0)
|
| 274 |
|
|
|
| 275 |
|
|
|
| 276 |
|
|
qdbCon = sqlite3.connect(sys.argv[1]) # SQlite database connection
|
| 277 |
|
|
qdbC = qdbCon.cursor() # SQlite database cursor
|
| 278 |
|
|
|
| 279 |
|
|
checkRobotsTxt()
|
| 280 |
|
|
# start crawling:
|
| 281 |
|
|
hc = httplib.HTTPConnection(ba_host)
|
| 282 |
|
|
# use the shared module's processDB for parsing the database
|
| 283 |
|
|
# bool list is the dslflags param -- by default, only get unchecked houses
|
| 284 |
|
|
# may want to change this with new arg? and maxrows is False, for
|
| 285 |
|
|
# maximum sparseness
|
| 286 |
|
|
fibmod.processDB(qdbC,checkAddress,[True,False,False,False],False,int(sys.argv[2]),float(sys.argv[3]),fibmod.char2Bool(sys.argv[4]))
|
| 287 |
|
|
|
| 288 |
|
|
# done crawling, exit safely
|
| 289 |
|
|
safeexit()
|
| 290 |
|
|
|
| 291 |
|
|
|
| 292 |
|
|
|
| 293 |
|
|
# SQlite format is
|
| 294 |
|
|
# [civic] [road] [town] [lat] [long] [dsl]
|
| 295 |
|
|
# dsl options: [ -1: error / unparseable, 0: unknown, 1: no, 2: basic, 3: ultra ]
|