Subversion Repositories Misc

[/] [dslmap/] [forgottenislanderbot/] [fib-crawlbot.py] - Blame information for rev 14

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 14 art
#!/usr/bin/env python
2
 
3
# forgottenislanderbot - makes a DSL coverage map from Bell Aliant's website.
4
# Copyright (c) 2010 Art Ortenburger
5
#
6
# This program is free software; you can redistribute it and/or
7
# modify it under the terms of the GNU General Public License
8
# as published by the Free Software Foundation; either version 2
9
# of the License, or any later version.
10
#
11
# This program is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with this program; if not, write to the Free Software
18
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19
#
20
 
21
# fib-crawlbot.py
22
# by Art Ortenburger IV <utrrrongeeb a users , sf , net>
23
# written 2010-01-13 -
24
#
25
# forgottenislanderbot -- crawler bot
26
#
27
# Checks street addresses for DSL availability
28
# on Bell Aliant's website.
29
# Loads addresses from an SQLite database,
30
# selects a sparse but regular subset of addresses,
31
# and automatically shoves them through Bell Aliant's
32
# website at the designated rate, parsing the pages
33
# and updating the SQLite database.
34
#
35
 
36
 
37
import sys
38
import sqlite3
39
import robotparser
40
import fibmod
41
import httplib
42
import urllib
43
import time
44
import random
45
 
46
# Startup banner: the name the script was invoked as, then version and
# licence notice. Written as a single-argument print so the statement reads
# the same under the print statement and the print function.
print(sys.argv[0] + " " + """(forgottenislanderbot crawler bot)    Version 0.1
Copyright (c) 2010 Art Ortenburger. Licensed under the GNU GPL; no warranty.""")
48
 
49
# User-Agent the crawler announces itself with. If the default has to be
# replaced, a browser-like alternative is kept (commented out) below.
bot_useragent = "forgottenislanderbot"
# bot_useragent = """Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)"""

# Bell Aliant server name, and the handler on it that performs a whole
# address check with a single request.
ba_host = "productsandservice.bellaliant.net"
ba_handler = "/PS/pe/english/productsandservices/qualificationFullAddressCheck.do"

# Custom HTTP 1.1 request headers sent with every address lookup, listed as
# (name, value) pairs. Commented-out pairs are disabled: the limited
# byte-range request was tried and did not work; gzip was the next idea.
headers = dict([
    ("User-Agent", bot_useragent),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
    ("Accept-Language", "en-us,en;q=0.5"),
#    ("Accept-Encoding", "gzip,deflate"),
    ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7"),
#    ("Range", "bytes=15335-17374"), # experimental; didn't work
    ("Keep-Alive", 3000),
    ("Connection", "keep-alive"),
    ("Referer", "http://productsandservice.bellaliant.net/PS/pe/english/productsandservices/qualificationApartmentCheck.do"),
#    ("Cookie", "region=pe; language=en; ALIANTCON=7FmtLNjpM8NpwNfnKtt1nhnVQ30dGLyZGyZVvSjz311BvbJNbrhC!563379176!642852716; es_vis_ck=1263379648515_7410; es_session_ck=1263379648515_7588; WT_FPC=id=248ff551ebe935daaef1263379651212:lv=1263379867207:ss=1263379651212; AliantWebSurvey_Invited=Invited"),
    ("Content-Type", "application/x-www-form-urlencoded"),
])

# Every so often the crawler takes a 'long' pause, which
# a) commits the database, and
# b) leaves a wider window in which a SIGINT can land.
safetypauseblock = 20 # full requests between two pauses. Make larger?
safetypause = 3 # length of the pause in seconds; a float works too
safetypausecount = 0 # requests made so far in the current block
78
 
79
def checkAddress(sqlcon,town,road,civic,delay,status,lon,lat):
80
    """ Checks a single full civic address' DSL status on the Bell Aliant website.
81
    sqlcon: sqlite3.Cursor for FIB database
82
    town:   town of address
83
    road:   street of address
84
    civic:  int; civic number for address
85
    delay:  float; seconds to wait after request
86
    status: DSL status of address. Used to skip.
87
    lon:    float; longitude of address
88
    lat:    float; latitude of address"""
89
 
90
    # print a progress line for each address
91
    print "checking ", civic, road, ",", town, ":",
92
    sys.stdout.flush() # fix for progress line on Linux
93
 
94
##    # avoid re-checking items
95
##    if status > 0: # reserve -1 for errors, etc
96
##        print status," [ skipping ]"
97
##        sys.stdout.flush()
98
##        return
99
 
100
    try:
101
        # POST form value params
102
        # warning: using urllib mixes up the original header order
103
        params = urllib.urlencode({"province_code_id":"pe",
104
                                   "product_id":"2281",
105
                                   "streetName":road,
106
                                   "product_id":"2281",
107
                                   "communityName":town,
108
                                   "curbody":"51",
109
                                   "bodycont":"null",
110
                                   "section":"51",
111
                                   "subsection":"0",
112
                                   "community_id":"null",
113
                                   "apartment":"",
114
                                   "streetNumber":civic,
115
                                   "streetNumberSuffix":""})
116
        hc.connect() # seems to be needed, over dial-up, at least
117
        hc.request("POST",ba_handler,params,headers)
118
        print ".",
119
        sys.stdout.flush()
120
        hr = hc.getresponse()
121
        if hr.status != httplib.OK:
122
            print ">",hr.status,
123
            pass # maybe set an error flag?
124
        print ".",
125
        sys.stdout.flush()
126
        hrd = hr.read()
127
        print ".",
128
        sys.stdout.flush()
129
       # print hrd
130
        if hrd.find("<b>We're Sorry!</b>") != -1: # no dsl
131
            sqlcon.execute("update dsl set dsl=1 where civic=? and road=? and town=?",
132
                           [civic,road,town])
133
            print ": 1;",
134
            sys.stdout.flush()
135
        elif hrd.find("Ultra") != -1: # refine this
136
            sqlcon.execute("update dsl set dsl=3 where civic=? and road=? and town=?",
137
                           [civic,road,town])
138
            print ": 3;",
139
            sys.stdout.flush()
140
        elif hrd.find("<td>Congratulations! You can choose") != -1: # refine this
141
            sqlcon.execute("update dsl set dsl=2 where civic=? and road=? and town=?",
142
                           [civic,road,town])
143
            print ": 2;",
144
            sys.stdout.flush()
145
        else: # error
146
            sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
147
                           [civic,road,town])
148
            print ": !! unparseable !!",
149
            sys.stdout.flush()
150
        print "ok...",
151
        sys.stdout.flush()
152
    except KeyboardInterrupt:
153
        print
154
        print "SIGINT received, exiting"
155
        sys.stdout.flush()
156
        safeexit()
157
    except httplib.HTTPException, httperr:
158
        try:
159
            sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
160
                           [civic,road,town])
161
        except:
162
            pass
163
        print "!! error !!:"
164
        print httperr
165
        print sys.exc_info()[0]
166
        print ":continuing:"
167
        sys.stdout.flush()
168
    except Exception, errdet: # IO errors, net errors, etc.
169
        try:
170
            sqlcon.execute("update dsl set dsl=-1 where civic=? and road=? and town=?",
171
                           [civic,road,town])
172
        except:
173
            pass
174
        print "!! error !!:"
175
        print errdet
176
        print sys.exc_info()[0]
177
        print ":continuing:"
178
        sys.stdout.flush()
179
    finally:
180
        try:
181
            # sleep to reduce server load / pretend to not be a bot
182
            time.sleep((delay*(0.75+random.random()*0.5))) # semi-randomized time
183
            # if needed, kill the script with SIGINT during the sleep.
184
        except KeyboardInterrupt:
185
            safeexit()
186
        except: # can't have _anything_ stop crawler accidentally
187
            pass
188
    print "next"
189
    sys.stdout.flush()
190
 
191
    # safety pause, to save the database and allow for Ctrl-C killing:
192
    try:
193
        globals()['safetypausecount'] = safetypausecount + 1
194
        if safetypausecount >= safetypauseblock:
195
            globals()['safetypausecount'] = 0
196
            print "routine ",safetypause,"s delay to save the database and ease exiting:",
197
            sys.stdout.flush()
198
            qdbCon.commit()
199
            print "committed, sleeping...",
200
            sys.stdout.flush()
201
            time.sleep(safetypause)
202
            print "continuing."
203
            sys.stdout.flush()
204
    except KeyboardInterrupt:
205
        print "exiting:"
206
        safeexit()
207
    except Exception, errdet: # anything else
208
        print "!! error during pause !! :",errdet
209
        pass
210
 
211
 
212
def checkRobotsTxt():
    """ Consults the server's robots.txt before crawling starts.

    Fetches http://<ba_host>/robots.txt and asks whether bot_useragent may
    request ba_handler. If it may, or if robots.txt cannot be fetched or
    parsed, the function simply returns. If access is denied the operator is
    asked whether to override; any answer other than "y" (including end of
    input) ends the program with exit status 0. Ctrl-C ends the program
    through safeexit()."""
    try:
        rfp = robotparser.RobotFileParser()
        rfp.set_url("http://"+ba_host+"/robots.txt")
        rfp.read()
        # can_fetch() falls back to the "*" rules by itself when no rule
        # names this user agent, so a separate "*" query is not needed --
        # and or-ing one in would let the wildcard rule overrule a rule
        # that denies this bot by name.
        allowed = rfp.can_fetch(bot_useragent,"http://"+ba_host+ba_handler)
    except KeyboardInterrupt:
        print("caught SIGINT, exiting")
        safeexit()
    except Exception:
        # doesn't care if it doesn't load. Catching Exception instead of
        # using a bare except keeps SystemExit out of this handler.
        print("Couldn't fetch robots.txt; continuing")
        return

    if allowed:
        return

    print("""
********************************************
********************************************
***                                      ***
*** A robots.txt file is denying access. ***
***                                      ***
*** Override? (y/*)                      ***""")
    try:
        ora = raw_input("*** ----->")
    except KeyboardInterrupt:
        print("caught SIGINT, exiting")
        safeexit()
    except EOFError:
        ora = "" # no operator to ask: treat as "do not override"

    if ora != "y":
        print("****** Exiting: denied access due to robots.txt")
        # This exit happens outside any try block, so nothing can swallow
        # it; previously the bare except caught it and the crawl went on.
        sys.exit(0)
    print("****** Continuing: robots.txt overridden")
241
 
242
 
243
 
244
def safeexit():
    """ stop crawling, hopefully saving all records
    and closing all streams.

    Closes the HTTP connection and the database cursor, commits and closes
    the database connection, then ends the program with exit status 0 by
    raising SystemExit. Each step is attempted separately so that a failure
    in one (or a resource that was never created) cannot prevent the
    commit; failures are reported on stderr instead of being dropped."""
    def attempt(description, action):
        # run one cleanup step without letting it stop the following ones
        try:
            action()
        except NameError:
            # the resource does not exist yet: safeexit() can be called
            # before the HTTP connection has been created
            pass
        except (Exception, KeyboardInterrupt):
            sys.stderr.write("!! could not %s: %s\n"
                             % (description, sys.exc_info()[1]))

    attempt("close the HTTP connection", lambda: hc.close())
    attempt("close the database cursor", lambda: qdbC.close())
    # commit and close are separate steps: a failed commit is reported and
    # the connection is still closed afterwards
    attempt("commit the database", lambda: qdbCon.commit())
    attempt("close the database connection", lambda: qdbCon.close())
    # sys.exit instead of the site module's interactive exit() helper
    sys.exit(0)
261
 
262
 
263
# start the bot. First confirm args are present: the script name plus the
# four arguments of the usage text. All four are read below (sys.argv[1]
# to sys.argv[4]), so fewer than five entries means something is missing.
if len(sys.argv)<5:
    print("""
Usage:  fib-crawlbot.py (database.sqlite) (sparsity) (delay) (exclude)

         (sparsity)    minimum spacing between civic numbers to check
         (delay)       seconds between requests (randomized +/- 25%)
         (exclude)     whether to exclude the two cities (y/n)

""")
    sys.exit(0)


qdbCon = sqlite3.connect(sys.argv[1]) # SQLite database connection
qdbC = qdbCon.cursor() # SQLite database cursor

checkRobotsTxt()
# start crawling:
hc = httplib.HTTPConnection(ba_host)
# use the shared module's processDB for parsing the database
# bool list is the dslflags param -- by default, only get unchecked houses
# may want to change this with new arg? and maxrows is False, for
# maximum sparseness
# remaining arguments: sparsity (int), delay in seconds (float), and the
# exclude flag converted by fibmod.char2Bool
fibmod.processDB(qdbC,checkAddress,[True,False,False,False],False,int(sys.argv[2]),float(sys.argv[3]),fibmod.char2Bool(sys.argv[4]))

# done crawling, exit safely
safeexit()
290
 
291
 
292
 
293
# SQLite row format is
294
# [civic]   [road]    [town]  [lat]   [long]   [dsl]
295
# dsl options: [ -1: error / unparseable page, 0: unknown, 1: no, 2: basic, 3: ultra ]