import numpy as np
import pandas as pd
import json
import datetime
To extract my location history, I used Takeout, a service that allows users to download a copy of their data. Takeout covers various products and allows users to select between several file formats. I requested an archive of my location history in JSON format.
I loaded the data into a Pandas DataFrame, converted to usable units, and dropped unusable data points.
# Parse the Takeout export directly from the open file handle.
with open('LocationHistory.json', 'r') as fh:
    raw = json.load(fh)

# Build one row per entry of the export's 'locations' list.
ld = pd.DataFrame(raw['locations'])
del raw  # the parsed JSON is not needed once the frame exists

# Name the columns after the units they hold once converted below.
ld = ld.rename(columns={
    'latitudeE7': 'latitude',
    'longitudeE7': 'longitude',
    'timestampMs': 'timestamp',
})

# Scale the E7 coordinates down by 1e7.
ld['latitude'] = ld['latitude'] / 1e7
ld['longitude'] = ld['longitude'] / 1e7

# Scale the millisecond timestamps down to seconds, then derive a datetime
# column from them (fromtimestamp without a tz argument yields local time).
ld['timestamp'] = ld['timestamp'].astype(float) / 1000
ld['datetime'] = ld['timestamp'].map(datetime.datetime.fromtimestamp)
In preliminary mapping, I found several coordinates outside of where I had traveled in 2014. Some coordinates were in Italy. Since I did not travel east of 115° W, though I would have liked to, I removed the erroneous points.
# Keep only rows whose accuracy estimate is under 1000 m, then renumber them.
ld = ld.loc[ld['accuracy'] < 1000].reset_index(drop=True)

# Discard the erroneous points: anything not west of 115° W.
ld = ld.loc[ld['longitude'] < -115.0]
Next, I want to keep only the 2014 data.
# Derive the calendar year of each row. assign() returns a new frame, so the
# column is not written into what may be a filtered view of an earlier frame.
ld = ld.assign(year=ld['datetime'].dt.year)

# Keep the 2014 rows in ascending timestamp order with a fresh 0..n-1 index.
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
ld_2014 = ld[ld['year'] == 2014].sort_values('timestamp').reset_index(drop=True)

# Number of 2014 rows (shown as the cell output when run in a notebook).
len(ld_2014)
This section obfuscates the few places I visit most often by keeping only a small random sample of the points recorded at each of them.
# Coordinates of the two places to be thinned out. They are compared below
# against latitude/longitude values rounded to 3 decimal places.
# NOTE(review): XXXX is presumably a redaction placeholder, not a defined
# name -- substitute real numeric values before running this cell.
lat0 = XXXX
lon0 = XXXX
lat1 = XXXX
lon1 = XXXX
# Round coordinates to 3 decimal places so that every fix at one of the two
# places collapses onto the same (lat, lon) pair.
ld_2014['lat_rounded'] = ld_2014['latitude'].round(3)
ld_2014['lon_rounded'] = ld_2014['longitude'].round(3)

# Boolean masks marking the rows recorded at each place.
at_place0 = (ld_2014['lat_rounded'] == lat0) & (ld_2014['lon_rounded'] == lon0)
at_place1 = (ld_2014['lat_rounded'] == lat1) & (ld_2014['lon_rounded'] == lon1)

# reset_index() without inplace returns a new frame, which avoids mutating a
# slice of ld_2014 and gives each subset its own 0..n-1 labels.
place0 = ld_2014[at_place0].reset_index(drop=True)
place1 = ld_2014[at_place1].reset_index(drop=True)

# Keep 5% of the place0 rows and 1% of the place1 rows. The sample size must
# be an integer for np.random.choice, hence int(); labels are drawn with
# replacement (the np.random.choice default), as before. .loc replaces the
# removed .ix indexer.
place0_sample = place0.loc[np.random.choice(place0.index, int(0.050 * len(place0)))]
place1_sample = place1.loc[np.random.choice(place1.index, int(0.010 * len(place1)))]

# Everything recorded at neither place is kept in full.
ld_2014_remaining = ld_2014[~at_place0]
ld_2014_remaining = ld_2014_remaining[~at_place1[ld_2014_remaining.index]]

# DataFrame.append() has been removed from pandas; pd.concat() stacks the
# same three frames in the same order.
ld_2014_usable = pd.concat([ld_2014_remaining, place0_sample, place1_sample])
ld_2014_usable = ld_2014_usable.reset_index(drop=True)
I first wanted to plot the coordinates. However, given the volume of data--the browser slows when trying to render that many objects--I decided to take a random sample of 20,000 points.
# Draw 20,000 row labels at random (np.random.choice samples with replacement
# by default) and select those rows by label. .loc replaces the .ix indexer,
# which has been removed from pandas.
ld_2014_sample = ld_2014_usable.loc[np.random.choice(ld_2014_usable.index, 20000)]
Here, I create a list of tuples of coordinates based on the sample to iterate through and output to a .geojson file.
# Build (latitude, longitude) tuples from the sampled rows themselves.
# Looking the sample's row labels up in ld_2014 would be wrong: the sample is
# indexed like ld_2014_usable, so the same labels name different rows in
# ld_2014 and would pull in points that the subsampling step had dropped.
coords = list(zip(ld_2014_sample['latitude'], ld_2014_sample['longitude']))
Output the points to a GeoJSON file and modify a JavaScript file.
import json
# GeoJSON wrapper that collects one feature per sampled coordinate.
geo_data = {
    'type': 'FeatureCollection',
    'features': []
}

for points in coords:
    # Each point is a GeoJSON "feature". coords holds (latitude, longitude)
    # tuples, while GeoJSON positions are [longitude, latitude], hence the
    # swapped indices.
    feature = {
        'type': 'Feature',
        'geometry': {
            "type": "Point",
            "coordinates": [float(points[1]), float(points[0])]
        },
        # A feature's "properties" become attribute columns in GIS
        'properties': {
            'points': points
        }
    }
    # Add the feature into the GeoJSON wrapper
    geo_data['features'].append(feature)

# json.dump writes text, so the file is opened in text mode ('w'); a binary
# handle ('wb') raises TypeError on Python 3.
with open('location_hist_2014.geojson', 'w') as f:
    json.dump(geo_data, f, indent=2)
Next, I add the variable name to the top of the file, which I'll use in my Leaflet map, and save it as .js.
# Read the GeoJSON back as text; the content is JSON text and is written out
# again unchanged below.
with open('location_hist_2014.geojson', 'r') as infile:
    lines = infile.readlines()

# Prefix the JSON with a JavaScript variable assignment so the data can be
# loaded as a script. Text mode is required because str values are written;
# a binary handle ('wb') raises TypeError on Python 3. Both files are closed
# by their with-blocks, so no explicit close() calls are needed.
with open('location_hist_2014.js', 'w') as outfile:
    outfile.write('var location_2014 = ')
    outfile.writelines(lines)
Even with only 4.4% of my location data, the page was laggy.
Knowing that I wanted to create a hexbin layer, I output all of the usable data.
# Export only the two coordinate columns, longitude first, without the index.
csv_columns = ['longitude', 'latitude']
location_hist_for_csv = ld_2014_usable[csv_columns]
location_hist_for_csv.to_csv('location_hist_2014.csv', index=False)