79714084

Date: 2025-07-25 00:48:42
Score: 0.5
Natty:
Report link

Update to PJ- Answer: Scraping All Property Titles from Map Using Bounding Box and Selenium

After some troubleshooting, I found two issues: the original point-based query only fetched one property, and the gistoken and mapserverUrl values are loaded dynamically via JavaScript (they are not in the static HTML, so using requests alone fails with an IndexError). The solution uses a bounding box for the map area and Selenium to extract the token and URL.

Key Changes:

Requirements: pip install requests selenium webdriver-manager

Working Script:

import re
import json
import urllib3
import requests
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Suppress the InsecureRequestWarning that urllib3 emits for requests sent
# with verify=False (the map-server query later in this script uses it).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Property page that is opened in the browser further down in this script.
link = 'https://www.bcassessment.ca/Property/Info/SjAwMDAwQzRZMQ=='

# Values that are identical in both header sets below.
_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'

# Browser-like headers addressed to the main www.bcassessment.ca host.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': _ACCEPT_LANGUAGE,
    'host': 'www.bcassessment.ca',
    'user-agent': _USER_AGENT,
}

# Headers addressed to the arcgis.bcassessment.ca host, with the main site
# given as origin and referer.
headers_ano = {
    'accept': '*/*',
    'accept-language': _ACCEPT_LANGUAGE,
    'host': 'arcgis.bcassessment.ca',
    'origin': 'https://www.bcassessment.ca',
    'referer': 'https://www.bcassessment.ca/',
    'user-agent': _USER_AGENT,
}

# Envelope of the map area to query (adjust as needed; the values can be read
# from the browser's Dev Tools > Network tab).
bounding_box = dict(
    xmin=-13742000,
    ymin=6199100,
    xmax=-13741000,
    ymax=6199300,
    spatialReference=dict(wkid=102100, latestWkid=3857),
)

# Selenium setup (suppress logs)
options = Options()
options.add_argument('--log-level=3')
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
try:
    driver.get(link)
    time.sleep(5)  # Fixed wait so the page's JavaScript has time to run
    page_source = driver.page_source
finally:
    # Always shut the browser down, even when navigation fails; otherwise the
    # Chrome/chromedriver processes are left running.
    driver.quit()

# Extract token and mapserver URL from the `var gistoken = "..."` and
# `var mapserverUrl = "..."` assignments in the rendered page.
token_match = re.search(r"var\s*gistoken\s*=\s*['\"](.*?)['\"]", page_source)
token = token_match.group(1) if token_match else None
mapserver_match = re.search(r"var\s*mapserverUrl\s*=\s*['\"](.*?)['\"]", page_source)
mapserver_url = mapserver_match.group(1) if mapserver_match else None

if not token or not mapserver_url:
    # SystemExit with a message prints it to stderr and exits with status 1,
    # so callers can detect the failure; it also avoids the site-provided
    # exit() helper, which is not guaranteed to exist.
    raise SystemExit("Failed to extract token or mapserver URL. Site may have changed.")

# Query params with pagination
page_size = 1000  # Records requested per page
params = {
    'f': 'json',
    'where': '',
    'geometry': json.dumps(bounding_box),
    'geometryType': 'esriGeometryEnvelope',
    'inSR': '102100',
    'outFields': 'ADDRESS,STREET_NAME,STREET_NUMBER',
    'orderByFields': 'STREET_NAME,STREET_NUMBER',
    'returnGeometry': 'false',
    'outSR': '102100',
    'resultRecordCount': str(page_size),
    'resultOffset': '0'
}

# Fetch and paginate: request page after page until the service stops
# reporting that more records are available.
with requests.Session() as session:
    params['token'] = token
    query_url = f"{mapserver_url}/0/query"
    offset = 0
    addresses = []
    while True:
        params['resultOffset'] = str(offset)
        try:
            # NOTE: verify=False disables TLS certificate verification.
            # The timeout keeps a stalled connection from hanging the script.
            res = session.get(query_url, params=params, headers=headers_ano,
                              verify=False, timeout=30)
        except requests.exceptions.RequestException as exc:
            print(f"Failed: {exc}")
            break
        if res.status_code != 200:
            print(f"Failed: {res.status_code}")
            break
        data = res.json()
        if 'error' in data:
            # The service can answer HTTP 200 with an error object in the body
            # (e.g. for a rejected token); report it instead of treating it as
            # an empty result.
            print(f"Failed: {data['error']}")
            break
        features = data.get('features', [])
        if not features:
            break
        addresses.extend(item['attributes']['ADDRESS'] for item in features)
        if not data.get('exceededTransferLimit', False):
            break
        # Advance by the number of records actually received: the server may
        # return fewer than page_size per page, and stepping by a fixed 1000
        # would then skip records.
        offset += len(features)

# Print and save
# Explicit UTF-8 so the output does not depend on the platform's default
# encoding when an address contains non-ASCII characters.
with open('properties.txt', 'w', encoding='utf-8') as f:
    for address in addresses:
        if address is None:
            # A null ADDRESS attribute in the JSON response arrives as None;
            # skip it instead of failing on None + '\n'.
            continue
        print(address)
        f.write(f"{address}\n")

Notes:

This fetches all visible titles in the box. Adjust outFields for more data (e.g., TOTAL_ASSESSED).

To answer your question:

Could you shed some light on how I generate these values: xmin = -13742000, ymin = 6199100, xmax = -13741000, ymax = 6199300

If you don't want to get the bounding box with Dev Tools, I made this program to get the coordinates by just entering an address and the size of the box you want around that address. It may not be 100% accurate, but it should work fine. Example: python geocode_to_bounding_box.py -a 123 Main St, Springfield 12345 -b 1x0.2

import requests
import math
import json
import argparse
import sys

# Function to convert lat/lon to Web Mercator (WKID 102100)
def latlon_to_mercator(lat, lon):
    """Project a latitude/longitude pair (degrees) to Web Mercator x/y.

    Args:
        lat: Latitude in degrees.
        lon: Longitude in degrees.

    Returns:
        A ``(x, y)`` tuple of projected coordinates.

    Raises:
        ValueError: If the tangent term is not positive (e.g. ``lat == -90``),
            because its logarithm is undefined.
    """
    origin_shift = 20037508.34  # Projected x value at longitude 180
    easting = lon * origin_shift / 180

    # Northing: log-tan of the half angle, expressed in degrees, then scaled
    # the same way as the easting.
    half_angle = (90 + lat) * math.pi / 360
    stretched_lat = math.log(math.tan(half_angle)) / (math.pi / 180)
    northing = stretched_lat * origin_shift / 180
    return easting, northing

# Set up argument parser
# Command-line interface: -a/--address (one or more words, joined later),
# -b/--box-size (WIDTHxHEIGHT) and the optional -m/--meters flag.
parser = argparse.ArgumentParser(
    description="Convert an address to a Web Mercator bounding box.",
    epilog="Example: python geocode_to_bounding_box.py -a 123 Main St, Springfield 12345 -b 1x0.2\n"
           "Note: For addresses with special characters, enclose in quotes if required by your shell.",
    # The default formatter re-wraps the epilog and drops its explicit
    # newline; the raw formatter keeps "Example" and "Note" on separate lines.
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
    "-a", "--address",
    nargs='+',  # Lets the address be typed unquoted as several words
    required=True,
    help="Address to geocode (e.g., 123 Main St, Springfield 12345; quotes needed for addresses with special characters)"
)
parser.add_argument(
    "-b", "--box-size",
    required=True,
    help="Bounding box size in kilometers (e.g., 1x0.2 for 1km wide, 0.2km tall)"
)
parser.add_argument(
    "-m", "--meters",
    action="store_true",
    help="Interpret box size as meters instead of kilometers"
)

# Parse arguments
args = parser.parse_args()

# Process address: the words collected by nargs='+' are joined back together.
address = ' '.join(args.address).strip()

# Geocode using Nominatim API.
# The query is passed via `params` so requests URL-encodes the address;
# interpolating it into the URL would let characters such as '&', '#' or '+'
# corrupt the query string.
geocode_url = "https://nominatim.openstreetmap.org/search"
geocode_params = {'q': address, 'format': 'json', 'limit': 1}
headers = {'User-Agent': 'GeocodingScript/1.0'}

try:
    response = requests.get(geocode_url, params=geocode_params, headers=headers, timeout=10)
    response.raise_for_status()  # Raise HTTPError for bad responses (4xx/5xx)
    data = response.json()
except (requests.exceptions.RequestException, ValueError) as e:
    # ValueError covers a response body that is not valid JSON.
    print(f"Geocoding failed: {e}")
    sys.exit(1)

if not data:
    print("No results found for the address.")
    sys.exit(1)

# Only the first match is used (the request asks for limit=1).
lat = float(data[0]['lat'])
lon = float(data[0]['lon'])

# Validate latitude and longitude
if not (-90 <= lat <= 90) or not (-180 <= lon <= 180):
    print("Invalid coordinates: Latitude and longitude out of range.")
    sys.exit(1)

if abs(lat) > 85:
    print("Warning: Locations near the poles may have projection issues in Web Mercator.")

print(f"Found coordinates: Latitude {lat}, Longitude {lon}")

# Process box size: expects WIDTHxHEIGHT; negative values are folded to their
# absolute value.
try:
    parts = args.box_size.split('x')
    if len(parts) != 2:
        raise ValueError("Invalid format. Must be exactly WIDTHxHEIGHT.")
    width = abs(float(parts[0]))
    height = abs(float(parts[1]))
    # float() accepts 'nan' and 'inf'; reject them so they cannot end up in
    # the bounding box (json.dumps would emit NaN/Infinity, which is not
    # valid JSON).
    if not (math.isfinite(width) and math.isfinite(height)):
        raise ValueError("Width and height must be finite numbers.")
    if width == 0 or height == 0:
        print("Warning: Box size of zero creates a point, not a bounding box.")
except ValueError as e:
    print(f"Invalid box size: {e}")
    sys.exit(1)

# Convert to meters if kilometers (default)
if not args.meters:
    width *= 1000
    height *= 1000

# Convert to Mercator
try:
    center_x, center_y = latlon_to_mercator(lat, lon)
except ValueError as e:
    print(f"Coordinate conversion failed: {e} (likely due to invalid latitude)")
    sys.exit(1)

# Web Mercator stretches distances by 1 / cos(latitude), so a ground distance
# must be scaled up by that factor before it is added to projected
# coordinates. Without it the box is smaller on the ground than requested
# (by roughly a third at 49 degrees latitude).
mercator_scale = 1 / math.cos(math.radians(lat))

# Compute bounding box centred on the geocoded point
half_width = width / 2 * mercator_scale
half_height = height / 2 * mercator_scale
bounding_box = {
    "xmin": center_x - half_width,
    "ymin": center_y - half_height,
    "xmax": center_x + half_width,
    "ymax": center_y + half_height,
    "spatialReference": {"wkid": 102100, "latestWkid": 3857}
}

# Display as JSON
print(json.dumps(bounding_box, indent=4))
Reasons:
  • Blacklisted phrase (1): THx
  • Long answer (-1):
  • Has code block (-0.5):
  • Low reputation (1):
Posted by: bigjokker