Wednesday, April 8, 2015

Web Scraping using R

# Set the R working directory to a fixed Windows folder.
# NOTE(review): nothing in the visible script reads or writes a relative path,
# so this machine-specific call may be unnecessary - confirm before relying on it.
setwd("C:\\Test")

#================================Example USPS Web Data===============================
#Scrape all the information about USPS post office locations from the web
#Read URL lines
#One can change the "=06484" part to get information about other ZIP areas
# Download the USPS locator results page as a character vector, one element
# per line of the response.
# The URL is assembled with paste0() because a string literal that spans
# several source lines keeps its embedded newlines and indentation, which
# would corrupt the query string sent to the server.
r.page2 <- readLines(paste0(
  "https://tools.usps.com/go/POLocatorAction.action?",
  "locationTypeQ=all&address=06484&tAddress=&tAddress1Ams=&",
  "tAddress2Ams=&tCityAms=&tStateAms=&tZipAms=&",
  "tCarrierRouteAms=&sWithin=20&refineSearchState=&",
  "refineSearchTwistState=&bSearch=Search"
))

# Regular expressions for each query-string field to extract. The single
# capture group holds the field's value: every character up to the next "&".
LName_pattern <- "locationName=([^&]*)"
lon_pattern <- "longitude=([^&]*)"
lat_pattern <- "latitude=([^&]*)"
add_pattern <- "address1=([^&]*)"
city_pattern <- "city=([^&]*)"
state_pattern <- "state=([^&]*)"
zip_pattern <- "zip5=([^&]*)"

# Show the page lines that contain a longitude field, to inspect the raw data.
# The whole vector is searched directly, and TRUE is spelled out because `T`
# is an ordinary variable that can be reassigned.
grep(lon_pattern, r.page2, value = TRUE)

#Define two functions
# Cut the matched substrings out of a string.
#   s  the string that was searched.
#   g  match start positions carrying a "match.length" attribute, as found in
#      one element of a gregexpr() result.
# Returns the substrings of `s` spanning each match.
getexpr <- function(s, g) {
  match_len <- attr(g, "match.length")
  last_pos <- g + match_len - 1
  substring(s, g, last_pos)
}

# Extract every value captured by `mypattern` from the lines of a page.
#
# Arguments:
#   mypattern  regular expression with one capture group around the value.
#   r.page     character vector of page lines (e.g. from readLines()).
#
# Returns an unnamed character vector with one element per match, in page
# order, or character(0) when nothing matches.
WebScrape <- function(mypattern, r.page) {
  # Keep only the lines that contain at least one match.
  datalines <- grep(mypattern, r.page, value = TRUE)
  if (length(datalines) == 0L) {
    return(character(0))
  }
  # regmatches() yields one character vector per line; flattening it means a
  # line with several matches simply contributes several elements, whatever
  # the number of matches on the other lines.
  matches <- unlist(
    regmatches(datalines, gregexpr(mypattern, datalines)),
    use.names = FALSE
  )
  # Replace each full match by its first capture group, i.e. the bare value.
  gsub(mypattern, "\\1", matches)
}

# Extract each field of interest from the downloaded page.
LName <- WebScrape(LName_pattern, r.page2)
lat <- WebScrape(lat_pattern, r.page2)
lon <- WebScrape(lon_pattern, r.page2)
add <- WebScrape(add_pattern, r.page2)
city <- WebScrape(city_pattern, r.page2)
state <- WebScrape(state_pattern, r.page2)
zip <- WebScrape(zip_pattern, r.page2)

# cbind() recycles shorter vectors, so fields that matched a different number
# of times would be paired with the wrong rows; flag that case explicitly.
if (length(unique(vapply(
  list(LName, lat, lon, add, city, state, zip), length, integer(1)
))) > 1L) {
  warning(
    "USPS fields matched different numbers of times; ",
    "rows of USPS_Location may be misaligned",
    call. = FALSE
  )
}

# Combine the fields column-wise, one column per field.
USPS_Location <- cbind(LName, lat, lon, add, city, state, zip)