We are working on a university project using Inside Airbnb data. We download the listings for many different cities and want to combine them, row-bound, into a single data frame. We have already solved that part. Now we need to add a column to each dataset saying which city it belongs to. I thought of something like: data_files[i] <- cbind(data_files[i], City = data_files[i])
However, this does not work, especially not inside the lapply function.
Downloading the data:
library('tidyverse')
library('dplyr')
# Read the semicolon-separated table of Inside Airbnb listing URLs.
# The columns used below are Country, City and Link.
Airbnb_listing_urls <- read.delim(
  "Airbnb_listing_urls.csv",
  sep = ";",
  encoding = "UTF-8"
)

# Countries to keep.
# NOTE(review): Ireland and Malta are EU members but are not in this list --
# confirm whether leaving them out is intended.
countries_in_EU <- c(
  "Austria", "Belgium", "Bulgaria", "Croatia", "Republic of Cyprus",
  "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany",
  "Greece", "Hungary", "Italy", "Latvia", "Lithuania", "Luxembourg",
  "The Netherlands", "Poland", "Portugal", "Romania", "Slovakia", "Slovenia",
  "Spain", "Sweden"
)

# Strip surrounding whitespace from the country names. trimws() removes only
# whitespace, whereas sub('.', '', x) deletes the first character of every
# value, which mangles values that have no leading space.
Airbnb_listing_urls$Country <- trimws(Airbnb_listing_urls$Country)

# Keep only the listings whose country is in the list above.
listing_EU <- Airbnb_listing_urls %>% filter(Country %in% countries_in_EU)

# Percent-encode the accented characters that occur in the links so that
# download.file() does not fail on them.
listing_EU$Link <- gsub("é", "%C3%A9", listing_EU$Link)
listing_EU$Link <- gsub("í", "%C3%AD", listing_EU$Link)
listing_EU$Link <- gsub("ä", "%C3%A4", listing_EU$Link)
listing_EU$Link <- gsub("ü", "%C3%BC", listing_EU$Link)

# Create the target folder; stay quiet if it already exists from an earlier run.
dir.create("cities", showWarnings = FALSE)

# Download one file per listing, named after its city. Looping over the actual
# number of rows avoids requesting NA links (or skipping rows) when the
# filtered table does not have exactly 42 rows.
for (i in seq_len(nrow(listing_EU))) {
  myurl <- listing_EU$Link[i]
  myfile <- paste0("cities/", listing_EU$City[i], ".csv")
  # The links point at gzip-compressed files, so transfer them in binary mode.
  download.file(url = myurl, destfile = myfile, mode = "wb")
}
Furthermore, this is the code that our question is about:
library('tidyverse')
# List the downloaded city files and print their names.
data_files <- list.files("cities/")
data_files

# Read only the first two files, producing one data frame per file.
all_data <- lapply(paste0("cities/", data_files[1:2]), read_csv)

# Stack the per-file data frames into a single data frame.
df <- bind_rows(all_data)
With the following dataset:
structure(list(Country = c(" The Netherlands", " Belgium", " United States",
" Greece", " United States", " Thailand", " Spain", " Australia",
" Australia", " China", " Belize", " Italy", " Germany", " Italy",
" France", " United States", " United Kingdom", " United States",
" Belgium", " Argentina", " United States", " South Africa",
" United States", " United States", " United States", " Denmark",
" Greece", " United States", " United States", " Ireland", " United Kingdom",
" Spain", " Italy", " United States", " Switzerland", " Belgium",
" Spain", " United Kingdom", " United States", " China", " Turkey",
" United States", " Portugal", " United Kingdom", " United States",
" France", " Spain", " Spain", " Spain", " Australia", " Spain",
" Mexico", " Australia", " Italy", " Canada", " Australia", " Germany",
" Italy", " United States", " Canada", " United States", " United States",
" United States", " Australia", " United States", " Norway",
" United States", " France", " France", " United States", " Portugal",
" Czech Republic", " Italy", " Canada", " United States", " Latvia",
" Brazil", " Italy", " The Netherlands", " United States", " United States",
" United States", " United States", " United States", " United States",
" Chile", " United States", " Spain", " China", " Italy", " Singapore",
" Greece", " Sweden", " Australia", " Taiwan", " Australia",
" The Netherlands", " Greece", " Japan", " Canada", " Italy",
" United States", " Spain", " Canada", " Switzerland", " Italy",
" Canada", " Austria", " United States", " Australia", " Canada",
" Switzerland", "Ireland", "Malta", "New Zealand"), City = c("Amsterdam",
"Antwerp", "Asheville", "Athens", "Austin", "Bangkok", "Barcelona",
"Barossa Valley", "Barwon South West", "Beijing", "Belize", "Bergamo",
"Berlin", "Bologna", "Bordeaux", "Boston", "Bristol", "Broward County",
"Brussels", "Buenos Aires", "Cambridge", "Cape Town", "Chicago",
"Clark County", "Columbus", "Copenhagen", "Crete", "Dallas",
"Denver", "Dublin", "Edinburgh", "Euskadi", "Florence", "Fort Worth",
"Geneva", "Ghent", "Girona", "Greater Manchester", "Hawaii",
"Hong Kong", "Istanbul", "Jersey City", "Lisbon", "London", "Los Angeles",
"Lyon", "Madrid", "Malaga", "Mallorca", "Melbourne", "Menorca",
"Mexico City", "Mid North Coast", "Milan", "Montreal", "Mornington Peninsula",
"Munich", "Naples", "Nashville", "New Brunswick", "New Orleans",
"New York City", "Newark", "Northern Rivers", "Oakland", "Oslo",
"Pacific Grove", "Paris", "Pays Basque", "Portland", "Porto",
"Prague", "Puglia", "Quebec City", "Rhode Island", "Riga", "Rio de Janeiro",
"Rome", "Rotterdam", "Salem", "San Diego", "San Francisco", "San Mateo County",
"Santa Clara County", "Santa Cruz County", "Santiago", "Seattle",
"Sevilla", "Shanghai", "Sicily", "Singapore", "South Aegean",
"Stockholm", "Sydney", "Taipei", "Tasmania", "The Hague", "Thessaloniki",
"Tokyo", "Toronto", "Trentino", "Twin Cities MSA", "Valencia",
"Vancouver", "Vaud", "Venice", "Victoria", "Vienna", "Washington",
"Western Australia", "Winnipeg", "Zurich", "Ireland", "Malta",
"New Zealand"), Link = c("http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2022-09-07/data/listings.csv.gz",
"http://data.insideairbnb.com/belgium/vlg/antwerp/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/nc/asheville/2022-09-14/data/listings.csv.gz",
"http://data.insideairbnb.com/greece/attica/athens/2022-06-17/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/tx/austin/2022-09-12/data/listings.csv.gz",
"http://data.insideairbnb.com/thailand/central-thailand/bangkok/2022-06-20/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/catalonia/barcelona/2022-09-10/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/sa/barossa-valley/2022-06-21/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/vic/barwon-south-west-vic/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/china/beijing/beijing/2022-06-21/data/listings.csv.gz",
"http://data.insideairbnb.com/belize/bz/belize/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/lombardia/bergamo/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/germany/be/berlin/2022-06-13/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/emilia-romagna/bologna/2022-06-11/data/listings.csv.gz",
"http://data.insideairbnb.com/france/nouvelle-aquitaine/bordeaux/2022-09-12/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ma/boston/2022-06-13/data/listings.csv.gz",
"http://data.insideairbnb.com/united-kingdom/england/bristol/2022-06-20/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/fl/broward-county/2022-06-17/data/listings.csv.gz",
"http://data.insideairbnb.com/belgium/bru/brussels/2022-09-18/data/listings.csv.gz",
"http://data.insideairbnb.com/argentina/ciudad-autónoma-de-buenos-aires/buenos-aires/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ma/cambridge/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/south-africa/wc/cape-town/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/il/chicago/2022-09-14/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/nv/clark-county-nv/2022-06-13/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/oh/columbus/2022-06-20/data/listings.csv.gz",
"http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2022-06-24/data/listings.csv.gz",
"http://data.insideairbnb.com/greece/crete/crete/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/tx/dallas/2022-09-14/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/co/denver/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/ireland/leinster/dublin/2022-09-11/data/listings.csv.gz",
"http://data.insideairbnb.com/united-kingdom/scotland/edinburgh/2022-09-13/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/pv/euskadi/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/toscana/florence/2022-06-11/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/tx/fort-worth/2022-06-08/data/listings.csv.gz",
"http://data.insideairbnb.com/switzerland/geneva/geneva/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/belgium/vlg/ghent/2022-06-20/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/catalonia/girona/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/united-kingdom/england/greater-manchester/2022-06-17/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/hi/hawaii/2022-09-12/data/listings.csv.gz",
"http://data.insideairbnb.com/china/hk/hong-kong/2022-06-14/data/listings.csv.gz",
"http://data.insideairbnb.com/turkey/marmara/istanbul/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/nj/jersey-city/2022-06-15/data/listings.csv.gz",
"http://data.insideairbnb.com/portugal/lisbon/lisbon/2022-09-13/data/listings.csv.gz",
"http://data.insideairbnb.com/united-kingdom/england/london/2022-09-10/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/los-angeles/2022-09-09/data/listings.csv.gz",
"http://data.insideairbnb.com/france/auvergne-rhone-alpes/lyon/2022-06-08/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/comunidad-de-madrid/madrid/2022-09-11/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/andalucía/malaga/2022-06-29/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/islas-baleares/mallorca/2022-09-13/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/vic/melbourne/2022-09-09/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/islas-baleares/menorca/2022-06-29/data/listings.csv.gz",
"http://data.insideairbnb.com/mexico/df/mexico-city/2022-06-21/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/nsw/mid-north-coast/2022-09-10/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/lombardy/milan/2022-09-14/data/listings.csv.gz",
"http://data.insideairbnb.com/canada/qc/montreal/2022-09-12/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/vic/mornington-peninsula/2022-09-13/data/listings.csv.gz",
"http://data.insideairbnb.com/germany/bv/munich/2022-06-21/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/campania/naples/2022-09-14/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/tn/nashville/2022-06-13/data/listings.csv.gz",
"http://data.insideairbnb.com/canada/nb/new-brunswick/2022-06-24/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/la/new-orleans/2022-09-09/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ny/new-york-city/2022-09-07/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/nj/newark/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/nsw/northern-rivers/2022-09-14/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/oakland/2022-09-18/data/listings.csv.gz",
"http://data.insideairbnb.com/norway/oslo/oslo/2022-06-25/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/pacific-grove/2022-06-19/data/listings.csv.gz",
"http://data.insideairbnb.com/france/ile-de-france/paris/2022-06-06/data/listings.csv.gz",
"http://data.insideairbnb.com/france/pyrénées-atlantiques/pays-basque/2022-06-10/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/or/portland/2022-09-16/data/listings.csv.gz",
"http://data.insideairbnb.com/portugal/norte/porto/2022-09-13/data/listings.csv.gz",
"http://data.insideairbnb.com/czech-republic/prague/prague/2022-09-16/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/puglia/puglia/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/canada/qc/quebec-city/2022-09-10/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ri/rhode-island/2022-06-29/data/listings.csv.gz",
"http://data.insideairbnb.com/latvia/riga/riga/2022-06-25/data/listings.csv.gz",
"http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2022-06-20/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/lazio/rome/2022-09-11/data/listings.csv.gz",
"http://data.insideairbnb.com/the-netherlands/south-holland/rotterdam/2022-06-15/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/or/salem-or/2022-06-15/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/san-diego/2022-09-18/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/san-francisco/2022-09-07/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/san-mateo-county/2022-06-15/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/santa-clara-county/2022-06-15/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/ca/santa-cruz-county/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/chile/rm/santiago/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/wa/seattle/2022-09-18/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/andalucía/sevilla/2022-06-27/data/listings.csv.gz",
"http://data.insideairbnb.com/china/shanghai/shanghai/2022-06-21/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/sicilia/sicily/2022-06-25/data/listings.csv.gz",
"http://data.insideairbnb.com/singapore/sg/singapore/2022-06-22/data/listings.csv.gz",
"http://data.insideairbnb.com/greece/south-aegean/south-aegean/2022-09-16/data/listings.csv.gz",
"http://data.insideairbnb.com/sweden/stockholms-län/stockholm/2022-06-25/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/nsw/sydney/2022-09-09/data/listings.csv.gz",
"http://data.insideairbnb.com/taiwan/northern-taiwan/taipei/2022-06-29/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/tas/tasmania/2022-09-07/data/listings.csv.gz",
"http://data.insideairbnb.com/the-netherlands/south-holland/the-hague/2022-06-15/data/listings.csv.gz",
"http://data.insideairbnb.com/greece/central-macedonia/thessaloniki/2022-06-17/data/listings.csv.gz",
"http://data.insideairbnb.com/japan/kantō/tokyo/2022-06-24/data/listings.csv.gz",
"http://data.insideairbnb.com/canada/on/toronto/2022-09-07/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/trentino-alto-adige-südtirol/trentino/2022-06-29/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/mn/twin-cities-msa/2022-09-16/data/listings.csv.gz",
"http://data.insideairbnb.com/spain/vc/valencia/2022-06-13/data/listings.csv.gz",
"http://data.insideairbnb.com/canada/bc/vancouver/2022-09-10/data/listings.csv.gz",
"http://data.insideairbnb.com/switzerland/vd/vaud/2022-09-07/data/listings.csv.gz",
"http://data.insideairbnb.com/italy/veneto/venice/2022-09-07/data/listings.csv.gz",
"http://data.insideairbnb.com/canada/bc/victoria/2022-06-24/data/listings.csv.gz",
"http://data.insideairbnb.com/austria/vienna/vienna/2022-09-11/data/listings.csv.gz",
"http://data.insideairbnb.com/united-states/dc/washington-dc/2022-09-14/data/listings.csv.gz",
"http://data.insideairbnb.com/australia/wa/western-australia/2022-06-17/data/listings.csv.gz",
"http://data.insideairbnb.com/canada/mb/winnipeg/2022-09-13/data/listings.csv.gz",
"http://data.insideairbnb.com/switzerland/zürich/zurich/2022-06-24/data/listings.csv.gz",
"http://data.insideairbnb.com/ireland/2021-12-23/data/listings.csv.gz",
"http://data.insideairbnb.com/malta/2021-12-28/data/listings.csv.gz",
"http://data.insideairbnb.com/new-zealand/2022-08-08/data/listings.csv.gz"
)), class = "data.frame", row.names = c(NA, -115L))
Hopefully the question and data are clear. Thank you in advance!
Rather than lapply, consider the elementwise mapply (or its list-returning wrapper Map) to iterate through the data files together with the City and even Country columns, and assign the corresponding values to new columns. Also, consider file.path (an OS-agnostic way to concatenate file and folder names).
The code below assumes the posted dataset is Airbnb_listing_urls, which is filtered for EU countries because the first code block only downloads data files for those countries.
# Keep only the EU rows. Country is trimmed first because most values in the
# posted data carry a leading space (" Belgium"), which would otherwise never
# match the entries of countries_in_EU and leave the table empty.
listing_EU <- Airbnb_listing_urls |>
  transform(Country = trimws(Country)) |>
  subset(Country %in% countries_in_EU)

# Derive each file name from its city (the download step saves
# "cities/<City>.csv"). Pairing the rows with the sorted output of list.files()
# by position would depend on the sort order and on every download succeeding.
EU_data_files <- paste0(listing_EU$City, ".csv")

# Drop listings whose file is not on disk, so files and rows stay aligned.
is_downloaded <- file.exists(file.path("cities", EU_data_files))
listing_EU <- listing_EU[is_downloaded, ]
EU_data_files <- EU_data_files[is_downloaded]

# Read every city file and tag its rows with the matching country and city.
airbnb_dfs <- Map(
  function(data_file, country, city) {
    city_df <- read.csv(file.path("cities", data_file))
    # Assign the columns explicitly: transform(country = country) would pick up
    # an existing `country` column of the file instead of the argument.
    # rep() keeps the assignment valid for a file with zero rows.
    city_df$country <- rep(country, nrow(city_df))
    city_df$city <- rep(city, nrow(city_df))
    city_df
  },
  EU_data_files,
  listing_EU$Country,
  listing_EU$City
)

# Stack all city data frames into one.
airbnb_df <- do.call(rbind, airbnb_dfs)
Alternatively, try the tidyverse counterpart: purrr::pmap.
# Keep only the EU rows. Country is trimmed first because most values in the
# posted data carry a leading space (" Belgium"), which would otherwise never
# match the entries of countries_in_EU and leave the table empty.
listing_EU <- Airbnb_listing_urls |>
  transform(Country = trimws(Country)) |>
  subset(Country %in% countries_in_EU)

# Derive each file name from its city (the download step saves
# "cities/<City>.csv"). Pairing the rows with the sorted output of list.files()
# by position would depend on the sort order and on every download succeeding.
EU_data_files <- paste0(listing_EU$City, ".csv")

# Drop listings whose file is not on disk, so files and rows stay aligned.
is_downloaded <- file.exists(file.path("cities", EU_data_files))
listing_EU <- listing_EU[is_downloaded, ]
EU_data_files <- EU_data_files[is_downloaded]

# Read every city file and tag its rows with the matching country and city.
# pmap() passes the list elements by position, so the argument names are free;
# they differ from the new column names so that mutate() cannot resolve them to
# an existing `country` or `city` column of the file.
airbnb_dfs <- purrr::pmap(
  list(EU_data_files, listing_EU$Country, listing_EU$City),
  function(data_file, country_name, city_name) {
    read_csv(file.path("cities", data_file)) %>%
      mutate(country = country_name, city = city_name)
  }
)

# Stack all city data frames into one.
airbnb_df <- bind_rows(airbnb_dfs)