In [163]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from geopy.distance import geodesic
import re
from matplotlib.colors import LinearSegmentedColormap
import numpy as np
from shapely.geometry import Point, shape
from shapely.ops import unary_union
from shapely.prepared import prep
import json
In [164]:
# Read all tables from the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_national_parks_of_Australia"
tables = pd.read_html(url)
names = []
lats = []
longs = []
def extract_coordinates(coord_str):
if pd.isna(coord_str):
return None, None
# Extract latitude and longitude using regex
pattern = r'(-?\d+\.?\d*°?S).*?(-?\d+\.?\d*°?E)'
match = re.search(pattern, coord_str)
if match:
lat_str, long_str = match.groups()
# Convert to decimal degrees
lat = float(lat_str.replace('°S', '').replace('°', '')) * -1 # Southern hemisphere is negative
long = float(long_str.replace('°E', '').replace('°', ''))
return lat, long
return None, None
# Process each table
for table in tables:
if 'Name' in table.columns and 'Coordinates' in table.columns:
for _, row in table.iterrows():
name = row['Name']
if isinstance(name, str) and 'National Park' in name:
# Clean the name by removing links and citations
clean_name = re.sub(r'\[.*?\]', '', name)
# Extract coordinates
lat, long = extract_coordinates(row['Coordinates'])
if lat is not None and long is not None:
names.append(clean_name)
lats.append(lat)
longs.append(long)
# Create the DataFrame
national_parks = pd.DataFrame({
'name': names,
'latitude': lats,
'longitude': longs
})
# Display the first few rows
print(national_parks.head())
name latitude longitude 0 Namadgi National Park -35.666667 148.950000 1 Abercrombie River National Park -34.093900 149.708000 2 Arakwal National Park -28.660300 153.621000 3 Bago Bluff National Park -31.532200 152.629000 4 Bald Rock National Park -28.852500 152.055556
In [171]:
# Optimize distance calculations using vectorization
def calculate_distances(parks_df, lat_grid, lon_grid):
# Create meshgrid of latitude and longitude
lat_mesh, lon_mesh = np.meshgrid(lat_grid, lon_grid)
# Reshape park coordinates for broadcasting
park_lats = parks_df['latitude'].values[:, np.newaxis, np.newaxis]
park_lons = parks_df['longitude'].values[:, np.newaxis, np.newaxis]
# Calculate distances using haversine formula
R = 6371 # Earth's radius in kilometers
lat1, lat2 = np.radians(lat_mesh), np.radians(park_lats)
lon1, lon2 = np.radians(lon_mesh), np.radians(park_lons)
dlat = lat2 - lat1
dlon = lon2 - lon1
a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
c = 2 * np.arcsin(np.sqrt(a))
distances = R * c
# Get minimum distance for each point
min_distances = np.min(distances, axis=0)
return min_distances.T # Transpose to match original orientation
# Calculate grid points (can save these)
latitudes = np.linspace(-45, -5, 400)
longitudes = np.linspace(110, 160, 400)
distances = calculate_distances(national_parks, latitudes, longitudes)
# Save the calculated distances
np.save('australia_distances.npy', distances)
In [172]:
def get_australia_geojson():
with open('australia-with-states_782.geojson') as f:
geojson = json.load(f)
# Extract all state geometries with a small buffer
geometries = []
for feature in geojson['features']:
state = shape(feature['geometry'])
# Add a very small buffer to handle topology issues
state = state.buffer(0.0001)
geometries.append(state)
# Combine all states
australia = unary_union(geometries)
return australia
In [173]:
australia = get_australia_geojson()
australia_shape = shape(australia)
australia_shape.boundary
Out[173]: