Web scraping

import requests

# Fetch the page over HTTP. A timeout is passed explicitly because requests
# applies none by default, so an unresponsive server would otherwise block
# this cell (and a Restart & Run All) indefinitely.
r = requests.get('http://blue.math.buffalo.edu', timeout=10)

# Display the Response object; its repr shows the HTTP status code.
r

<Response [200]>

# Show the response body as a string (here, the HTML source of the page).
r.text

'<html>\n<style type="text/css">  \n \tBODY { background: url(Ringland4_cropped67pc.jpg); background-repeat: no-repeat; \t\tmargin-top: 10px; margin-left: 50px;\n\t} \n\tp1 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 20pt;  \n        color: white;  \n\t}  \n \tp2 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 12pt;  \n        color: white;  \n\t}  \n \tp3 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 10pt;\n        color: white;  \n\t}  \n \tp4 \n\t{  \n\t \tfont-family: sans;  \n\t \tfont-size: 11pt; \n\t\tlink-color: white; \n        color: white;  \n\t}  \n</style>  \n\n<bgcolor=grey>\n<!--body background="big_orange_sticker.png"--> \n<p3>\n<br>\n</p3>\n<title>John Ringland, Mathematics Department, University at Buffalo</title>\n<!--img src=faculty_picture_board_ringland_20pc.jpg><br><br-->\n<!--img src=jr200907.jpg><br><br-->\n<!--img src=P8120090_crop.jpg><br><br-->\n<!--img src=P8150054_cropped_scaled.jpg-->\n<!--img src=jrtraced.png><br><br-->\n<!--img src=jrretraced.png><br><br-->\n<p></p>\n\n<p1>John Ringland</p1><br>\n<p2>Associate Professor of Mathematics</p2><br><br>\n<p3>Mathematics Department<br>\nUniversity at Buffalo<br>\n244 Mathematics Bldg<br>\nBuffalo NY 14260<br>\n716-645-8773<br>\nringland at math.buffalo.edu</p3><br><br>\n\n<p4><a style="color:white;text-decoration:none" \n   href=http://math.buffalo.edu/people/faculty_instructors.shtml>Up</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=research.html>Research</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=teaching.html>Teaching</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=service.html>Service</a></p4> <font color=white>.</font> \n<p4><a style="color:white;text-decoration:none" \n   href=software.html>Software</a></p4>\n<p>\n<!--font face=tahoma color=darkgray>\nThis website was migrated on Jan 26, 2012 from 
orange.math.buffalo.edu.<br>\nIf you find any broken links, please let me know. Thanks!<-->\n'

# Browser-style User-Agent string, assembled from adjacent literals so each
# product token group sits on its own line; the resulting value is unchanged.
ua = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"
)

# Repeat the request, this time sending the browser-style string in `ua` as
# the User-agent header (presumably to compare how the server treats a
# browser-like client; verify against the lecture intent).
user_agent = {'User-agent': ua}
# Explicit timeout: requests applies none by default, so a stalled server
# would otherwise hang the cell forever.
r = requests.get('http://blue.math.buffalo.edu', headers=user_agent, timeout=10)

import pandas

# Load the airports table from a CSV file in the working directory, using
# pandas' default parsing for every column.
df = pandas.read_csv('airports.csv')

# Preview the first few rows of the table.
df.head()

def foo(x):
    """Identity function: hand back the argument exactly as received.

    Used below as a per-column converter for `pandas.read_csv`, where the
    point is to pass each raw cell value through untouched.
    """
    return x
# Route the 'continent' column through the identity converter `foo` when
# re-reading the file. With the converter in place the column keeps its raw
# text; a later row display shows continent as the string 'NA' instead of NaN.
myconverters = dict(continent=foo)
df = pandas.read_csv('airports.csv', converters=myconverters)
# Preview the first two rows of the reloaded table.
df.head(n=2)

# Select the 'type' column as a Series (one facility category per airport).
df['type']

0              heliport
1         small_airport
2         small_airport
3         small_airport
4                closed
5         small_airport
6         small_airport
7         small_airport
8         small_airport
9              heliport
10               closed
11        small_airport
12             heliport
13        small_airport
14        small_airport
15             heliport
16             heliport
17        small_airport
18        small_airport
19             heliport
20        small_airport
21             heliport
22        small_airport
23        small_airport
24        small_airport
25             heliport
26             heliport
27        small_airport
28        small_airport
29             heliport
              ...      
53174    medium_airport
53175     large_airport
53176     small_airport
53177    medium_airport
53178    medium_airport
53179    medium_airport
53180    medium_airport
53181     small_airport
53182    medium_airport
53183     large_airport
53184    medium_airport
53185     small_airport
53186    medium_airport
53187    medium_airport
53188    medium_airport
53189    medium_airport
53190    medium_airport
53191    medium_airport
53192    medium_airport
53193    medium_airport
53194     small_airport
53195     large_airport
53196    medium_airport
53197     large_airport
53198    medium_airport
53199    medium_airport
53200    medium_airport
53201          heliport
53202     small_airport
53203     small_airport
Name: type, dtype: object

# Select the two coordinate columns; indexing with a list of names returns a
# DataFrame rather than a Series.
df[['latitude_deg','longitude_deg']]

# Number of rows (airports) in the table.
len(df)

53204

import matplotlib.pyplot as plt
%matplotlib inline

# Scatter every airport by (longitude, latitude). Tiny, nearly transparent
# markers let dense regions show up as darker areas.
# Uses the explicit Figure/Axes interface and labels the axes and title so the
# saved image is understandable on its own.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(df['longitude_deg'], df['latitude_deg'], 'bo', alpha=0.05, ms=2)
ax.set_xlabel('Longitude (deg)')
ax.set_ylabel('Latitude (deg)')
ax.set_title('Airport locations')
# Write the figure to disk next to the notebook.
fig.savefig('airports.png')

# Largest latitude value in the table (the northernmost entry).
df['latitude_deg'].max()

82.75

# Find which row holds the maximum latitude and display that index value.
# NOTE(review): the result is used positionally with df.iloc below. On some
# older pandas releases Series.argmax was a deprecated alias that returned the
# index *label*; the two agree here only while df keeps its default 0..n-1
# index. Confirm the installed pandas version.
imax = df['latitude_deg'].argmax()
imax

13355

# Show the full record at integer position `imax`, i.e. the airport with the
# maximum latitude found above.
df.iloc[imax]

id                                           320326
ident                                       CA-0605
type                                         closed
name                 Disraeli Inlet Water Aerodrome
latitude_deg                                  82.75
longitude_deg                                   -73
elevation_ft                                    NaN
continent                                        NA
iso_country                                      CA
iso_region                                    CA-NU
municipality                         Disraeli Inlet
scheduled_service                                no
gps_code                                        NaN
iata_code                                       NaN
local_code                                      SW6
home_link                                       NaN
wikipedia_link                                  NaN
keywords                                        NaN
Name: 13355, dtype: object

# Tally how many airports fall into each 'type' category, most common first.
df['type'].value_counts()

small_airport     33779
heliport          10251
medium_airport     4542
closed             2995
seaplane_base      1021
large_airport       593
balloonport          23
Name: type, dtype: int64

# Export only the coordinate columns as tab-separated text, leaving out the
# row index.
df.loc[:, ['latitude_deg', 'longitude_deg']].to_csv(path_or_buf='temp.csv', sep='\t', index=False)

# Export the same two columns to an Excel workbook; the row index is written
# too, since no index argument is given.
df.loc[:, ['latitude_deg', 'longitude_deg']].to_excel(excel_writer='temp.xlsx')

# IPython help lookup: opens the documentation for pandas.read_csv.
# This is interactive-only syntax and is not valid in a plain Python script.
?pandas.read_csv

	latitude_deg	longitude_deg
0	40.070801	-74.933601
1	38.704022	-101.473911
2	59.949200	-151.695999
3	34.864799	-86.770302
4	35.608700	-91.254898
5	34.942803	-97.818019
6	34.305599	-112.165001
7	35.350498	-116.888000
8	39.427188	-121.763427
9	32.727374	-116.459742
10	40.622202	-104.344002
11	28.645500	-82.219002
12	28.846600	-82.345398
13	27.230900	-80.969200
14	33.767502	-84.068298
15	33.884201	-84.733902
16	19.832715	-155.980233
17	48.145302	-116.213997
18	39.724028	-101.395994
19	41.644501	-87.122803
20	41.978401	-89.560402
21	41.511398	-87.260597
22	40.025600	-89.122902
23	38.727798	-94.930496
24	37.409401	-84.619698
25	30.191944	-90.980833
26	39.665298	-89.705597
27	30.136299	-92.429398
28	38.757099	-75.753799
29	43.949402	-86.416702
...	...	...
53174	43.030800	89.098700
53175	43.907101	87.474197
53176	43.955799	81.330299
53177	41.105301	122.853996
53178	43.996201	125.684998
53179	39.266667	122.666944
53180	41.538101	120.434998
53181	42.069014	121.718122
53182	48.199494	134.366447
53183	45.623402	126.250000
53184	50.171621	127.308884
53185	44.002201	126.396004
53186	46.843399	130.464996
53187	45.293000	131.193000
53188	41.101398	121.061996
53189	47.752056	129.019125
53190	32.836389	97.036389
53191	44.524101	129.569000
53192	52.912778	122.430000
53193	47.239601	123.917999
53194	52.224444	124.720222
53195	38.965698	121.539001
53196	42.253889	125.703333
53197	41.639801	123.483002
53198	42.882801	129.451004
53199	40.542524	122.358600
53200	41.784401	123.496002
53201	51.894444	1.482500
53202	-11.584278	47.296389
53203	30.784722	130.270556

	id	ident	type	name	latitude_deg	longitude_deg	elevation_ft	continent	iso_country	iso_region	municipality	scheduled_service	gps_code	iata_code	local_code	home_link	wikipedia_link	keywords
0	6523	00A	heliport	Total Rf Heliport	40.070801	-74.933601	11.0	NaN	US	US-PA	Bensalem	no	00A	NaN	00A	NaN	NaN	NaN
1	323361	00AA	small_airport	Aero B Ranch Airport	38.704022	-101.473911	3435.0	NaN	US	US-KS	Leoti	no	00AA	NaN	00AA	NaN	NaN	NaN
2	6524	00AK	small_airport	Lowell Field	59.949200	-151.695999	450.0	NaN	US	US-AK	Anchor Point	no	00AK	NaN	00AK	NaN	NaN	NaN
3	6525	00AL	small_airport	Epps Airpark	34.864799	-86.770302	820.0	NaN	US	US-AL	Harvest	no	00AL	NaN	00AL	NaN	NaN	NaN
4	6526	00AR	closed	Newport Hospital & Clinic Heliport	35.608700	-91.254898	237.0	NaN	US	US-AR	Newport	no	NaN	NaN	NaN	NaN	NaN	00AR