-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.R
82 lines (67 loc) · 3.1 KB
/
utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Attach every package listed in a text file, installing those that fail to
# load.
#
# filePath: path to a plain-text file with one package name per line.
#           Surrounding whitespace is trimmed and blank lines are ignored.
#
# Packages that are already attached are skipped. For each remaining package
# library() is tried first; if that raises an error the package is installed
# with install.packages() and library() is tried once more (an error from this
# second attempt propagates to the caller).
#
# Returns (invisibly) the final status message, after printing progress
# messages.
loadPackages <- function(filePath) {
  # Passing the path straight to readLines() lets it open and close its own
  # connection, so nothing is left open when reading fails.
  listOfPack <- trimws(readLines(filePath))
  # Drop blank lines: an empty name would otherwise reach library("") and
  # install.packages("").
  listOfPack <- listOfPack[nzchar(listOfPack)]
  print("Loading all needed packages into the current workspace...")
  for (pkg in setdiff(listOfPack, .packages())) {
    tryCatch(
      library(pkg, character.only = TRUE),
      error = function(e) {
        # Loading failed: download the package, then try again.
        install.packages(pkg)
        library(pkg, character.only = TRUE)
      }
    )
  }
  print("Done. All packages currently loaded and ready to be used.")
}
# Build the analysis datasets and publish them as global variables.
#
# The raw table is read from one of two sources, tried in this order:
#   1. one cache file per column, "<dataDir><column>.RData", each holding an
#      object named after the column;
#   2. the original file "<dataDir>aidat.RData", holding an object named
#      `aidat`. In this case the per-column cache files are written next to
#      it so that the next call can use source 1.
#
# dataDir: directory prefix. It is pasted directly in front of the file names,
#          so it must end with a path separator.
#
# Side effects: assigns `aida`, `manufacturing`, `restaurants` and `media`
# with `<<-`. Relies on applyInflation(), addGeoArea(), addSize() and
# addSubsectorColumn(), which are defined outside this function.
#
# Stops with an error when neither source can be loaded; in that case no
# global is assigned. Called for its side effects; returns NULL invisibly.
loadDatasets <- function(dataDir) {
  colNames <- c("Ateco", "Company", "Form", "Province", "Region", "Status",
                "TaxID", "Year", "B", "E", "P", "R", "Infl", "GeoArea",
                "Size", "Growth")

  # Read one cached column. The file is loaded into a private environment so
  # that nothing in the caller's or the global environment can be picked up
  # or overwritten by accident.
  readColumn <- function(name) {
    holder <- new.env()
    load(paste0(dataDir, name, ".RData"), envir = holder)
    get(name, envir = holder, inherits = FALSE)
  }

  # Read the `aidat` object from the original file, or stop.
  readOriginal <- function() {
    holder <- new.env()
    tryCatch(
      {
        load(paste0(dataDir, "aidat.RData"), envir = holder)
        get("aidat", envir = holder, inherits = FALSE)
      },
      error = function(e) {
        stop("No aidat file neither distinct column files found to load datasets.")
      }
    )
  }

  # Write each known column of `aidat` to its own cache file, as a one-column
  # data frame stored under the column's name. Columns listed in colNames but
  # absent from `aidat` are skipped.
  cacheColumns <- function(aidat) {
    for (name in intersect(colNames, names(aidat))) {
      holder <- new.env()
      assign(name, aidat[name], envir = holder)
      # Saved under dataDir, the same place readColumn() looks.
      save(list = name, file = paste0(dataDir, name, ".RData"),
           envir = holder)
    }
  }

  origData <- NULL
  if (all(file.exists(paste0(dataDir, colNames, ".RData")))) {
    origData <- tryCatch(
      do.call(cbind, lapply(colNames, readColumn)),
      error = function(e) {
        warning("Could not read cached column files: ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
  }

  if (!is.null(origData)) {
    print("Loading dataset from distinct column files.")
  } else {
    print("Loading original dataset from original file")
    origData <- readOriginal()
    # Caching is best effort: failing to write the cache must not prevent the
    # datasets from being built.
    tryCatch(
      cacheColumns(origData),
      error = function(e) {
        warning("Could not write column cache files: ", conditionMessage(e),
                call. = FALSE)
      }
    )
  }

  # Preprocessing: change column types for Ateco (integer), TaxID (numeric)
  # and Year (integer); remove the TradingRegion and TradingProvince columns.
  # Going through as.character() converts factor columns by label rather
  # than by internal level code.
  origData$Ateco <- as.integer(as.character(origData$Ateco))
  origData$TaxID <- as.numeric(as.character(origData$TaxID))
  origData$Year <- as.integer(as.character(origData$Year))
  origData[which(names(origData) %in% c("TradingRegion", "TradingProvince"))] <- NULL
  origData <- applyInflation(origData)
  origData <- addGeoArea(origData)
  origData <- addSize(origData)

  print("Getting sectors subsets")
  # Keep the years 2007-2015 with non-negative R and E. subset() also drops
  # rows where the condition evaluates to NA.
  filtered <- subset(origData, Year > 2006 & Year < 2016 & R >= 0 & E >= 0)
  remove(origData)
  aida <<- filtered

  # manufacturing: rows with 101100 <= Ateco <= 332009 and no missing R or E,
  # passed through addSubsectorColumn().
  manufacturingRows <- subset(filtered, Ateco >= 101100 & Ateco <= 332009 &
                                !is.na(R) & !is.na(E))
  manufacturing <<- addSubsectorColumn(manufacturingRows)
  # restaurants: rows with 550000 <= Ateco < 570000.
  restaurants <<- subset(filtered, Ateco >= 550000 & Ateco < 570000)
  # media: rows with 580000 <= Ateco < 640000.
  media <<- subset(filtered, Ateco >= 580000 & Ateco < 640000)

  invisible(NULL)
}
# Attach the packages listed in the file whose path is wdir followed by
# packagesFile (both defined outside this statement).
loadPackages(paste0(wdir, packagesFile))