I gave this presentation at Code Camp. As a data scientist and backcountry skier, I was interested in looking at fatal avalanche data. This covers scraping the data and analyzing it with Python, pandas, and IPython Notebook. The final result is an infographic.
21. requests
# Download the Utah Avalanche Center fatality listing page.
import requests as r
# Landing page listing all recorded avalanche fatalities.
url = 'https://utahavalanchecenter.org/avalanches/fatalities'
req = r.get(url)  # HTTP GET; no error handling — assumes a 200 response
data = req.text  # decoded HTML body, later fed to BeautifulSoup
22. Scraping items
● Find <div class="content">
● Find <tr>'s
● Find <td>'s
– Field names: the end of a class attribute such as views-field-field-killed
– Field values: the string content of the <td>
● Also get the details URL from <td class='views-field-view-node'>
23. Code to Scrape
def get_info(data):
    """Parse the fatalities listing HTML into a list of row dicts.

    data: HTML text of the listing page.
    Returns a list of dicts mapping field name -> string value, one dict
    per table row that yielded at least one recognized field.
    """
    soup = BeautifulSoup(data)
    content = soup.find(id="content")
    trs = content.find_all('tr')
    res = []
    for tr in trs:
        tds = tr.find_all('td')
        # Fresh name instead of rebinding the `data` parameter, which the
        # original did — shadowing made the function harder to follow.
        row = {}
        for td in tds:
            name, value = get_field_name_value(td)
            if not name:
                continue  # <td> carried no recognized views-field class
            row[name] = value
        if row:  # skip header/empty rows that produced no fields
            res.append(row)
    return res
24. Code to Scrape
def get_field_name_value(elem):
    """Extract (field_name, value) from a listing-table <td> element.

    elem: a BeautifulSoup Tag for one <td>.
    Returns ('killed', '3')-style pairs for class names beginning with
    'views-field-field-', ('url', href) for the details-link cell, and
    (None, None) when no recognized class is present.
    """
    tags = elem.get('class')
    start = 'views-field-field-'
    for t in tags:
        if t.startswith(start):
            # The field name is whatever follows the common class prefix;
            # the value is all text inside the cell, whitespace-stripped.
            # (The slide wrapped this return across two lines, which would
            # be a SyntaxError as transcribed — rejoined here.)
            return t[len(start):], ''.join(elem.stripped_strings)
        elif t == 'views-field-view-node':
            # Details cell: the value is the <a>'s href, not its text.
            return 'url', elem.a['href']
    return None, None
28. Scraping Details
def get_avalanche_details(url, rows):
    """Fetch each accident's detail page and merge its fields into the row.

    url: site base URL; each row's relative 'url' value is appended to it.
    rows: list of dicts from the listing page (each must carry a 'url' key).
    Returns the same dicts, augmented with label -> value fields scraped
    from the detail page.  Fields already present from the listing win on
    key collisions.
    """
    res = []
    for item in rows:
        req = r.get(url + item['url'])
        soup = BeautifulSoup(req.text)
        content = soup.find(id='content')
        for div in content.find_all(class_='field'):
            key_elem = div.find(class_='field-label')
            if key_elem is None:
                # Field div without a label — nothing to key on; skip it.
                print("NONE!!! %s" % div)
                continue
            key = ''.join(key_elem.stripped_strings)
            try:
                value_elem = div.find(class_='field-item')
                # Strip non-breaking spaces (U+00A0) Drupal puts in values.
                # BUG FIX: the original replaced the literal 'xa0' (lost
                # backslash) rather than the NBSP character.
                value = ''.join(value_elem.stripped_strings).replace(u'\xa0', u' ')
            except AttributeError as e:
                # BUG FIX: the original fell through after printing, storing
                # the previous iteration's `value` (or raising NameError on
                # the first field).  Skip the field instead.
                print("%s %s" % (e, div))
                continue
            if key in item:
                continue  # don't clobber a field scraped from the listing
            item[key] = value
        res.append(item)
    return res
29. BS Notes
Can be annoying to find strings:
>>> from bs4 import BeautifulSoup
>>> s = BeautifulSoup('<div>foo<div>bar</div></div>')
>>> s
<html><body><div>foo<div>bar</div></div></body></html>
>>> s.string # This bothers me! None!
>>> s.strings
<generator object _all_strings at 0x...>
>>> list(s.strings)
[u'foo', u'bar']
33. Unicode bytes!
Traceback (most recent call last):
File "crawl.py", line 73, in <module>
crawl('/tmp/ava.csv', 2)
File "crawl.py", line 69, in crawl
df.to_csv(outname)
...
lib.write_csv_rows(self.data, ix, self.nlevels,
self.cols, self.writer)
File "pandas/lib.pyx", line 978, in
pandas.lib.write_csv_rows (pandas/lib.c:16858)
UnicodeEncodeError: 'ascii' codec can't encode character
u'\u200b' in position 70: ordinal not in range(128)
81. Enter gmaps
$ pip install gmaps
notebook code:
import gmaps
# Build (lat, lon) pairs, dropping rows whose latitude is NaN.
# str(x) == 'nan' is used because NaN != NaN makes direct comparison fail.
# Assumes df has numeric lat/lon columns — TODO confirm against the crawl.
d2 = [x for x in zip(df.lat, df.lon) if
str(x[0]) != 'nan']
gmaps.heatmap(d2)  # render fatality locations as a heatmap layer
82.
83. Enter Folium
Wraps leaflet.js
from IPython.display import HTML
import folium
def inline_map(map):
    """
    Embeds the HTML source of the map directly into the IPython notebook.

    This method will not work if the map depends on any files (json data).
    Also this uses the HTML5 srcdoc attribute, which may not be supported
    in all browsers.
    """
    map._build_map()  # renders the map's HTML into map.HTML
    # srcdoc is itself a double-quoted attribute, so double quotes inside
    # the embedded document must be entity-escaped.  BUG FIX: the slide
    # export decoded the entity, leaving the no-op replace('"', '"'), which
    # would truncate the iframe content at the first embedded quote.
    srcdoc = map.HTML.replace('"', '&quot;')
    return HTML('<iframe srcdoc="{srcdoc}" style="width: 100%; height: '
                '510px; border: none"></iframe>'.format(srcdoc=srcdoc))
def summary(i, row):
    """Build the HTML popup snippet for one accident row.

    i: row index used as the marker label prefix.
    row: mapping with 'year', 'Trigger', 'Location Name or Route' and
    'Accident and Rescue Summary' keys.
    """
    header = "{} {} {} {}".format(i, row['year'], row['Trigger'],
                                  row['Location Name or Route'])
    body = row['Accident and Rescue Summary']
    return "<b>{}</b> <p>{}</p>".format(header, body)
# Center the terrain map on a known accident location, then drop one
# popup marker per accident row that has usable coordinates.
accident_map = folium.Map(location=d2[4], zoom_start=10,
                          tiles='Stamen Terrain', height=700)
for idx, rec in df2.iterrows():
    # Skip rows with missing (NaN) or zero coordinates.
    if str(rec.lat) == 'nan' or rec.lat == 0:
        continue
    accident_map.simple_marker([rec.lat, rec.lon], popup=summary(idx, rec))
inline_map(accident_map)