Web scraping

In [1]:
import requests
In [6]:
# Fetch the page. requests applies no timeout by default, so give an explicit
# one; otherwise an unresponsive server would block this cell indefinitely.
r = requests.get('http://blue.math.buffalo.edu', timeout=10)
In [3]:
# Echo the response object; its repr shows the HTTP status code.
r
Out[3]:
<Response [200]>
In [4]:
# The response body decoded to text (here, the page's raw HTML).
r.text
Out[4]:
'<html>\n<style type="text/css">  \n \tBODY { background: url(Ringland4_cropped67pc.jpg); background-repeat: no-repeat; \t\tmargin-top: 10px; margin-left: 50px;\n\t} \n\tp1 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 20pt;  \n        color: white;  \n\t}  \n \tp2 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 12pt;  \n        color: white;  \n\t}  \n \tp3 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 10pt;\n        color: white;  \n\t}  \n \tp4 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 11pt; \n\t\tlink-color: white; \n        color: white;  \n\t}  \n</style>  \n\n<bgcolor=grey>\n<!--body background="big_orange_sticker.png"--> \n<p3>\n<br>\n</p3>\n<title>John Ringland, Mathematics Department, University at Buffalo</title>\n<!--img src=faculty_picture_board_ringland_20pc.jpg><br><br-->\n<!--img src=jr200907.jpg><br><br-->\n<!--img src=P8120090_crop.jpg><br><br-->\n<!--img src=P8150054_cropped_scaled.jpg-->\n<!--img src=jrtraced.png><br><br-->\n<!--img src=jrretraced.png><br><br-->\n<p></p>\n\n<p1>John Ringland</p1><br>\n<p2>Associate Professor of Mathematics</p2><br><br>\n<p3>Mathematics Department<br>\nUniversity at Buffalo<br>\n244 Mathematics Bldg<br>\nBuffalo NY 14260<br>\n716-645-8773<br>\nringland at math.buffalo.edu</p3><br><br>\n\n<p4><a style="color:white;text-decoration:none" \n   href=http://math.buffalo.edu/people/faculty_instructors.shtml>Up</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=research.html>Research</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=teaching.html>Teaching</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=service.html>Service</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=software.html>Software</a></p4>\n<p>\n<!--font face=tahoma color=darkgray>\nThis website was migrated on Jan 26, 2012 from 
orange.math.buffalo.edu.<br>\nIf you find any broken links, please let me know. Thanks!<-->\n'
In [5]:
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"
In [7]:
# Repeat the request, this time identifying the client with the browser
# User-Agent string held in `ua` instead of the library's default one.
user_agent = {'User-agent': ua}
# An explicit timeout keeps an unresponsive server from hanging the notebook
# (requests waits forever when none is given).
r = requests.get('http://blue.math.buffalo.edu', headers=user_agent, timeout=10)
In [8]:
import pandas
In [9]:
# Load the airport table with pandas' default parsing. By default read_csv
# treats the literal text "NA" as a missing value, which is why the
# `continent` column shows NaN in the preview below.
df = pandas.read_csv('airports.csv')
In [10]:
# Preview the first five rows to check the columns and how they were parsed.
df.head()
Out[10]:
id ident type name latitude_deg longitude_deg elevation_ft continent iso_country iso_region municipality scheduled_service gps_code iata_code local_code home_link wikipedia_link keywords
0 6523 00A heliport Total Rf Heliport 40.070801 -74.933601 11.0 NaN US US-PA Bensalem no 00A NaN 00A NaN NaN NaN
1 323361 00AA small_airport Aero B Ranch Airport 38.704022 -101.473911 3435.0 NaN US US-KS Leoti no 00AA NaN 00AA NaN NaN NaN
2 6524 00AK small_airport Lowell Field 59.949200 -151.695999 450.0 NaN US US-AK Anchor Point no 00AK NaN 00AK NaN NaN NaN
3 6525 00AL small_airport Epps Airpark 34.864799 -86.770302 820.0 NaN US US-AL Harvest no 00AL NaN 00AL NaN NaN NaN
4 6526 00AR closed Newport Hospital & Clinic Heliport 35.608700 -91.254898 237.0 NaN US US-AR Newport no NaN NaN NaN NaN NaN 00AR
In [12]:
def foo(x):
    """Return ``x`` unchanged.

    Serves as an identity converter for ``pandas.read_csv``: a column routed
    through it keeps its raw text instead of going through the default
    missing-value parsing.
    """
    return x
# Route the `continent` column through the identity converter `foo` so its
# raw text (e.g. "NA") is kept instead of being turned into NaN, then reload
# the file and preview two rows to confirm.
myconverters = dict(continent=foo)
df = pandas.read_csv(
    'airports.csv',
    converters=myconverters,
)
df.head(2)
Out[12]:
id ident type name latitude_deg longitude_deg elevation_ft continent iso_country iso_region municipality scheduled_service gps_code iata_code local_code home_link wikipedia_link keywords
0 6523 00A heliport Total Rf Heliport 40.070801 -74.933601 11.0 NA US US-PA Bensalem no 00A NaN 00A NaN NaN NaN
1 323361 00AA small_airport Aero B Ranch Airport 38.704022 -101.473911 3435.0 NA US US-KS Leoti no 00AA NaN 00AA NaN NaN NaN
In [13]:
# Selecting one column by name yields a Series.
df['type']
Out[13]:
0              heliport
1         small_airport
2         small_airport
3         small_airport
4                closed
5         small_airport
6         small_airport
7         small_airport
8         small_airport
9              heliport
10               closed
11        small_airport
12             heliport
13        small_airport
14        small_airport
15             heliport
16             heliport
17        small_airport
18        small_airport
19             heliport
20        small_airport
21             heliport
22        small_airport
23        small_airport
24        small_airport
25             heliport
26             heliport
27        small_airport
28        small_airport
29             heliport
              ...      
53174    medium_airport
53175     large_airport
53176     small_airport
53177    medium_airport
53178    medium_airport
53179    medium_airport
53180    medium_airport
53181     small_airport
53182    medium_airport
53183     large_airport
53184    medium_airport
53185     small_airport
53186    medium_airport
53187    medium_airport
53188    medium_airport
53189    medium_airport
53190    medium_airport
53191    medium_airport
53192    medium_airport
53193    medium_airport
53194     small_airport
53195     large_airport
53196    medium_airport
53197     large_airport
53198    medium_airport
53199    medium_airport
53200    medium_airport
53201          heliport
53202     small_airport
53203     small_airport
Name: type, dtype: object
In [14]:
# Selecting with a list of column names yields a DataFrame with just those columns.
df[['latitude_deg','longitude_deg']]
Out[14]:
latitude_deg longitude_deg
0 40.070801 -74.933601
1 38.704022 -101.473911
2 59.949200 -151.695999
3 34.864799 -86.770302
4 35.608700 -91.254898
5 34.942803 -97.818019
6 34.305599 -112.165001
7 35.350498 -116.888000
8 39.427188 -121.763427
9 32.727374 -116.459742
10 40.622202 -104.344002
11 28.645500 -82.219002
12 28.846600 -82.345398
13 27.230900 -80.969200
14 33.767502 -84.068298
15 33.884201 -84.733902
16 19.832715 -155.980233
17 48.145302 -116.213997
18 39.724028 -101.395994
19 41.644501 -87.122803
20 41.978401 -89.560402
21 41.511398 -87.260597
22 40.025600 -89.122902
23 38.727798 -94.930496
24 37.409401 -84.619698
25 30.191944 -90.980833
26 39.665298 -89.705597
27 30.136299 -92.429398
28 38.757099 -75.753799
29 43.949402 -86.416702
... ... ...
53174 43.030800 89.098700
53175 43.907101 87.474197
53176 43.955799 81.330299
53177 41.105301 122.853996
53178 43.996201 125.684998
53179 39.266667 122.666944
53180 41.538101 120.434998
53181 42.069014 121.718122
53182 48.199494 134.366447
53183 45.623402 126.250000
53184 50.171621 127.308884
53185 44.002201 126.396004
53186 46.843399 130.464996
53187 45.293000 131.193000
53188 41.101398 121.061996
53189 47.752056 129.019125
53190 32.836389 97.036389
53191 44.524101 129.569000
53192 52.912778 122.430000
53193 47.239601 123.917999
53194 52.224444 124.720222
53195 38.965698 121.539001
53196 42.253889 125.703333
53197 41.639801 123.483002
53198 42.882801 129.451004
53199 40.542524 122.358600
53200 41.784401 123.496002
53201 51.894444 1.482500
53202 -11.584278 47.296389
53203 30.784722 130.270556

53204 rows × 2 columns

In [15]:
# Number of rows in the table.
len(df)
Out[15]:
53204
In [16]:
import matplotlib.pyplot as plt
%matplotlib inline
In [25]:
# Scatter every airport by longitude/latitude. Markers are small and almost
# transparent, so overlapping points build up into darker regions.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(df['longitude_deg'], df['latitude_deg'], 'bo', alpha=0.05, ms=2)
# Label the axes and title the figure so the saved image is readable on its own.
ax.set(xlabel='Longitude (deg)', ylabel='Latitude (deg)', title='Airport locations')
fig.savefig('airports.png')
In [26]:
# Largest latitude value in the table.
df['latitude_deg'].max()
Out[26]:
82.75
In [29]:
# Locate the row holding the maximum latitude; the result is used as a
# positional index with df.iloc in the next cell.
# NOTE(review): older pandas releases returned the index *label* from
# Series.argmax (idxmax semantics). Label and position coincide here only
# because df carries the default 0..n-1 index — confirm against the pandas
# version in use.
imax = df['latitude_deg'].argmax()
imax
Out[29]:
13355
In [30]:
# Show every field of that row; iloc interprets imax as a row position.
df.iloc[imax]
Out[30]:
id                                           320326
ident                                       CA-0605
type                                         closed
name                 Disraeli Inlet Water Aerodrome
latitude_deg                                  82.75
longitude_deg                                   -73
elevation_ft                                    NaN
continent                                        NA
iso_country                                      CA
iso_region                                    CA-NU
municipality                         Disraeli Inlet
scheduled_service                                no
gps_code                                        NaN
iata_code                                       NaN
local_code                                      SW6
home_link                                       NaN
wikipedia_link                                  NaN
keywords                                        NaN
Name: 13355, dtype: object
In [31]:
# Count how many rows fall under each airport type, most frequent first.
df['type'].value_counts()
Out[31]:
small_airport     33779
heliport          10251
medium_airport     4542
closed             2995
seaplane_base      1021
large_airport       593
balloonport          23
Name: type, dtype: int64
In [33]:
# Export only the coordinate columns, tab-separated and without the row
# index. Note the file keeps a .csv name even though the delimiter is a tab.
df[['latitude_deg', 'longitude_deg']].to_csv(
    'temp.csv',
    sep='\t',
    index=False,
)
In [34]:
# Write the same two coordinate columns to an Excel workbook. No `index`
# argument is passed, so to_excel's default applies and the row index is
# written as the first column (the CSV export above passes index=False).
df[['latitude_deg','longitude_deg']].to_excel('temp.xlsx')
In [35]:
?pandas.read_csv