Two or more data frames can be combined into a single one according to the values of a single variables using the merge
function.
students <- data.frame(name = c("Paul", "Peter", "Amy", "Megan", "Selena"),
age = c(18, 19, 18, 20, 21),
stringsAsFactors=FALSE)
grades <- data.frame(name = c("Amy", "Megan", "Peter", "Drew"),
grade = c("A", "B+", "A-", "B"),
stringsAsFactors=FALSE)
merge(students, grades)
## name age grade
## 1 Amy 18 A
## 2 Megan 20 B+
## 3 Peter 19 A-
# we can be more specific about how to merge
merge(students, grades, by="name")
## name age grade
## 1 Amy 18 A
## 2 Megan 20 B+
## 3 Peter 19 A-
# if the common variable has different names, we need to specify it
grades <- data.frame(student = c("Amy", "Megan", "Peter", "Drew"),
grade = c("A", "B+", "A-", "B"),
stringsAsFactors=FALSE)
merge(students, grades, by.x="name", by.y="student")
## name age grade
## 1 Amy 18 A
## 2 Megan 20 B+
## 3 Peter 19 A-
Note that by default merge
will only return rows that are present in both dataset; but we can change that default behavior. For the non-matches, the returned value will be NA
.
merge(students, grades, by.x="name", by.y="student", all.x=TRUE)
## name age grade
## 1 Amy 18 A
## 2 Megan 20 B+
## 3 Paul 18 <NA>
## 4 Peter 19 A-
## 5 Selena 21 <NA>
merge(students, grades, by.x="name", by.y="student", all.y=TRUE)
## name age grade
## 1 Amy 18 A
## 2 Drew NA B
## 3 Megan 20 B+
## 4 Peter 19 A-
merge(students, grades, by.x="name", by.y="student", all=TRUE)
## name age grade
## 1 Amy 18 A
## 2 Drew NA B
## 3 Megan 20 B+
## 4 Paul 18 <NA>
## 5 Peter 19 A-
## 6 Selena 21 <NA>
What happens if names do not have exactly the same spelling? We can try some fuzzy string matching based on the Levenshtein (or edit) distance between strings. See ?adist
students <- data.frame(name = c("Paul", "Pete", "Amie", "Meghan", "Selena"),
age = c(18, 19, 18, 20, 21),
stringsAsFactors=FALSE)
grades <- data.frame(name = c("Amy", "Megan", "Peter", "Drew"),
grade = c("A", "B+", "A-", "B"),
stringsAsFactors=FALSE)
# probably there are better ways to code this...
students$grade <- NA
for (i in 1:nrow(students)){
dist <- adist(students$name[i], grades$name)
if (any(dist<4)){
students$grade[i] <- grades$grade[which.min(dist)]
}
}
students
## name age grade
## 1 Paul 18 <NA>
## 2 Pete 19 A-
## 3 Amie 18 A
## 4 Meghan 20 B+
## 5 Selena 21 <NA>
students <- data.frame(name = c("Paul", "Peter", "Amy", "Megan", "Selena"),
age = c(18, 19, 18, 20, 21),
stringsAsFactors=FALSE)
merge(students, grades)
## name age grade
## 1 Amy 18 A
## 2 Megan 20 B+
## 3 Peter 19 A-
To learn more about merging datasets based on strings, see the RecordLinkage
package.
Another package that is very useful is countrycode
, which contains a data frame with pretty much every potential country code you may want to use.
We often want to convert the format in which a data frame is structured. For example, we may to compute the average value of a variable, grouped by values of a different variable. Let’s see some examples using the aggregate
function:
d <- read.csv("~/data/unvoting.csv", stringsAsFactors=F)
aggregate(idealpoint ~ CountryAbb, data=d, FUN=mean)
## CountryAbb idealpoint
## 1 AAB -0.321975346
## 2 AFG -0.743184827
## 3 ALB -0.731741191
## 4 ALG -1.072331516
## 5 AND 0.997624350
## 6 ANG -0.834550724
## 7 ARG 0.466773197
## 8 ARM 0.263951907
## 9 AUL 1.439329514
## 10 AUS 0.972567741
## 11 AZE -0.054940797
## 12 BAH -0.652265926
## 13 BAR -0.025874241
## 14 BEL 1.643699421
## 15 BEN -0.424479143
## 16 BFO -0.405862817
## 17 BHM -0.016356751
## 18 BHU -0.455418992
## 19 BLR -1.485841528
## 20 BLZ -0.230150074
## 21 BNG -0.647943174
## 22 BOL 0.258725066
## 23 BOS 0.579010372
## 24 BOT -0.132484919
## 25 BRA 0.397884480
## 26 BRU -0.751028593
## 27 BUI -0.520690556
## 28 BUL -0.835299395
## 29 CAM -0.345161428
## 30 CAN 1.524966742
## 31 CAO -0.170294823
## 32 CAP -0.641121921
## 33 CDI 0.070070871
## 34 CEN -0.014981418
## 35 CHA -0.387022963
## 36 CHL 0.367165914
## 37 CHN -0.593210942
## 38 COL 0.329607122
## 39 COM -0.581682892
## 40 CON -0.690397326
## 41 COS 0.515998529
## 42 CRO 0.926010533
## 43 CUB -1.040910882
## 44 CYP 0.043858087
## 45 CZE -1.921071757
## 46 CZR 1.126187850
## 47 DEN 1.254143727
## 48 DJI -0.622261939
## 49 DMA -0.175044354
## 50 DOM 0.534634866
## 51 DRC -0.213268152
## 52 DRV -1.462438611
## 53 ECU 0.124674214
## 54 EGY -0.700777556
## 55 EQG -0.327079611
## 56 ERI -0.586986900
## 57 EST 1.036546141
## 58 ETH -0.322847861
## 59 ETM -0.223328744
## 60 FIJ 0.132465120
## 61 FIN 0.902689143
## 62 FRN 1.630735508
## 63 FSM 1.594445157
## 64 GAB -0.196512404
## 65 GAM -0.323640608
## 66 GDR -1.854022812
## 67 GFR 1.442225175
## 68 GHA -0.469053750
## 69 GNB -0.577730244
## 70 GRC 0.935106302
## 71 GRG 0.885555085
## 72 GRN -0.182420452
## 73 GUA 0.335027265
## 74 GUI -0.825244467
## 75 GUY -0.431085238
## 76 HAI 0.204360038
## 77 HON 0.464365140
## 78 HUN -0.840878786
## 79 ICE 1.185650061
## 80 IND -0.498628871
## 81 INS -0.690266980
## 82 IRE 0.999556659
## 83 IRN -0.491096541
## 84 IRQ -0.732378742
## 85 ISR 1.726464750
## 86 ITA 1.393039141
## 87 JAM -0.090941425
## 88 JOR -0.544759536
## 89 JPN 0.971563155
## 90 KEN -0.336768780
## 91 KUW -0.658986574
## 92 KYR -0.038621064
## 93 KZK 0.097375423
## 94 LAO -0.590769131
## 95 LAT 1.109589686
## 96 LBR 0.161144997
## 97 LEB -0.398888200
## 98 LES -0.114153349
## 99 LIB -1.033179969
## 100 LIE 0.931408461
## 101 LIT 1.057833086
## 102 LUX 1.608285812
## 103 MAA -0.679493777
## 104 MAC 0.904599025
## 105 MAD -0.405504657
## 106 MAG -0.391059818
## 107 MAL -0.335555410
## 108 MAS -0.238899042
## 109 MAW 0.304706031
## 110 MEX 0.028656130
## 111 MLD 0.895453676
## 112 MLI -0.790337969
## 113 MLT 0.463618342
## 114 MNC 1.008379994
## 115 MNG 0.882243757
## 116 MON -1.259744995
## 117 MOR -0.525026814
## 118 MSI 1.265935436
## 119 MYA -0.511901370
## 120 MZM -0.799510182
## 121 NAM -0.787633791
## 122 NAU 0.617491958
## 123 NEP -0.271337126
## 124 NEW 1.232127479
## 125 NIC 0.386579296
## 126 NIG -0.535306062
## 127 NIR -0.331949987
## 128 NOR 1.189414074
## 129 NTH 1.591873535
## 130 OMA -0.689087346
## 131 PAK -0.320544173
## 132 PAL 1.710688673
## 133 PAN 0.333613583
## 134 PAR 0.584912384
## 135 PER 0.332826508
## 136 PHI 0.034181571
## 137 PNG -0.028545065
## 138 POL -0.965032850
## 139 POR 1.338949529
## 140 PRK -1.610875636
## 141 QAT -0.730980538
## 142 ROK 0.560431086
## 143 ROM -0.490988956
## 144 RUS -1.344166127
## 145 RWA -0.189382820
## 146 SAF 1.137261876
## 147 SAL 0.419575436
## 148 SAU -0.476697677
## 149 SEN -0.363678566
## 150 SEY -0.749640085
## 151 SIE -0.359012516
## 152 SIN -0.147248444
## 153 SKN -0.276615269
## 154 SLO 1.055455225
## 155 SLU -0.317783939
## 156 SLV 1.017352086
## 157 SNM 0.833706802
## 158 SOL 0.045394903
## 159 SOM -0.592753912
## 160 SPN 1.019317475
## 161 SRI -0.542922454
## 162 SSUD 0.080758845
## 163 STP -0.638395777
## 164 SUD -0.968818091
## 165 SUR -0.379001547
## 166 SVG -0.177480856
## 167 SWA -0.015008537
## 168 SWD 1.051488491
## 169 SWZ 0.812609136
## 170 SYR -1.138487381
## 171 TAJ -0.003015838
## 172 TAW 0.980881104
## 173 TAZ -0.747727759
## 174 THI 0.112994255
## 175 TKM -0.403237598
## 176 TOG -0.317957308
## 177 TON 0.239468301
## 178 TRI -0.207891640
## 179 TUN -0.513942198
## 180 TUR 0.803360038
## 181 TUV 0.085368553
## 182 UAE -0.680056440
## 183 UGA -0.634884632
## 184 UKG 1.934657833
## 185 UKR -1.278563908
## 186 URU 0.440572999
## 187 USA 2.313139682
## 188 UZB 0.121101039
## 189 VAN -0.216416785
## 190 VEN 0.056115323
## 191 WSM 0.256462758
## 192 YAR -0.838385494
## 193 YPR -1.184582083
## 194 YUG -0.549893893
## 195 ZAM -0.587427769
## 196 ZAN -0.142002800
## 197 ZIM -0.917516027
aggregate(idealpoint ~ CountryAbb, data=d, FUN=median)
## CountryAbb idealpoint
## 1 AAB -0.316500800
## 2 AFG -0.708525450
## 3 ALB -1.375605000
## 4 ALG -1.078160000
## 5 AND 0.956633800
## 6 ANG -0.687650900
## 7 ARG 0.466451900
## 8 ARM 0.217508100
## 9 AUL 1.382966000
## 10 AUS 0.939241700
## 11 AZE -0.060395960
## 12 BAH -0.662515050
## 13 BAR -0.118212700
## 14 BEL 1.676815500
## 15 BEN -0.526670950
## 16 BFO -0.549375300
## 17 BHM -0.106256945
## 18 BHU -0.506064650
## 19 BLR -2.158284000
## 20 BLZ -0.313634400
## 21 BNG -0.791912400
## 22 BOL 0.268577300
## 23 BOS 0.744931750
## 24 BOT -0.316471000
## 25 BRA 0.077169865
## 26 BRU -0.857047700
## 27 BUI -0.528855200
## 28 BUL -1.811891000
## 29 CAM -0.268022050
## 30 CAN 1.557416000
## 31 CAO -0.202351950
## 32 CAP -0.626068100
## 33 CDI 0.132204070
## 34 CEN -0.034068890
## 35 CHA -0.327136700
## 36 CHL 0.400112800
## 37 CHN -0.718402750
## 38 COL 0.240228650
## 39 COM -0.612604550
## 40 CON -0.772654600
## 41 COS 0.461728450
## 42 CRO 0.939805500
## 43 CUB -1.526822000
## 44 CYP -0.064424400
## 45 CZE -2.293706500
## 46 CZR 1.103729500
## 47 DEN 1.231516500
## 48 DJI -0.703413050
## 49 DMA -0.294673200
## 50 DOM 0.535841150
## 51 DRC -0.100980830
## 52 DRV -1.456069000
## 53 ECU 0.008204963
## 54 EGY -0.771142950
## 55 EQG -0.293486250
## 56 ERI -0.626427100
## 57 EST 1.018306000
## 58 ETH -0.389460750
## 59 ETM -0.234853500
## 60 FIJ 0.084572990
## 61 FIN 0.917391950
## 62 FRN 1.584646500
## 63 FSM 1.871248000
## 64 GAB -0.269359100
## 65 GAM -0.381854700
## 66 GDR -2.262647000
## 67 GFR 1.429982000
## 68 GHA -0.541661500
## 69 GNB -0.586455850
## 70 GRC 0.886196050
## 71 GRG 0.827691150
## 72 GRN -0.271537300
## 73 GUA 0.318137400
## 74 GUI -0.769221200
## 75 GUY -0.487661600
## 76 HAI 0.253735900
## 77 HON 0.518668450
## 78 HUN -1.833337000
## 79 ICE 1.169448500
## 80 IND -0.483789750
## 81 INS -0.654845600
## 82 IRE 0.951812650
## 83 IRN -0.247687300
## 84 IRQ -0.890496200
## 85 ISR 2.051250000
## 86 ITA 1.496185000
## 87 JAM -0.168101850
## 88 JOR -0.562411300
## 89 JPN 1.020765000
## 90 KEN -0.360732600
## 91 KUW -0.621674700
## 92 KYR -0.010520990
## 93 KZK 0.141302400
## 94 LAO -0.875679600
## 95 LAT 1.077046000
## 96 LBR 0.217167700
## 97 LEB -0.350541150
## 98 LES -0.207490400
## 99 LIB -1.037750000
## 100 LIE 0.906396000
## 101 LIT 1.030051500
## 102 LUX 1.634739500
## 103 MAA -0.699618550
## 104 MAC 0.944997150
## 105 MAD -0.528512150
## 106 MAG -0.512079950
## 107 MAL -0.288990400
## 108 MAS -0.300473850
## 109 MAW 0.234206500
## 110 MEX -0.016497075
## 111 MLD 0.905109500
## 112 MLI -0.752641350
## 113 MLT 0.579622600
## 114 MNC 0.992084650
## 115 MNG 0.883088600
## 116 MON -1.417558000
## 117 MOR -0.615624950
## 118 MSI 1.251834000
## 119 MYA -0.337636650
## 120 MZM -0.736816150
## 121 NAM -0.772411700
## 122 NAU 0.442406050
## 123 NEP -0.283459800
## 124 NEW 1.179571500
## 125 NIC 0.142623700
## 126 NIG -0.516827900
## 127 NIR -0.297043100
## 128 NOR 1.188066000
## 129 NTH 1.575438000
## 130 OMA -0.676775800
## 131 PAK -0.362990500
## 132 PAL 1.984160000
## 133 PAN 0.230750700
## 134 PAR 0.638057550
## 135 PER -0.014807275
## 136 PHI -0.009607802
## 137 PNG -0.076111025
## 138 POL -1.833480000
## 139 POR 1.188712500
## 140 PRK -1.603360500
## 141 QAT -0.695668900
## 142 ROK 0.580691750
## 143 ROM -0.886191900
## 144 RUS -2.152497000
## 145 RWA -0.246268750
## 146 SAF 1.859164500
## 147 SAL 0.404142050
## 148 SAU -0.515755550
## 149 SEN -0.380929250
## 150 SEY -0.695410500
## 151 SIE -0.386932900
## 152 SIN -0.213463850
## 153 SKN -0.470962650
## 154 SLO 1.007696500
## 155 SLU -0.331839250
## 156 SLV 1.005981000
## 157 SNM 0.855217300
## 158 SOL 0.060753780
## 159 SOM -0.628680050
## 160 SPN 0.942399300
## 161 SRI -0.555506500
## 162 SSUD 0.080758845
## 163 STP -0.623581700
## 164 SUD -0.885570700
## 165 SUR -0.506624650
## 166 SVG -0.102258200
## 167 SWA -0.048791300
## 168 SWD 0.994247350
## 169 SWZ 0.770246500
## 170 SYR -1.337790000
## 171 TAJ -0.019060312
## 172 TAW 0.962063000
## 173 TAZ -0.733228000
## 174 THI -0.009497915
## 175 TKM -0.200468500
## 176 TOG -0.314558150
## 177 TON 0.229735300
## 178 TRI -0.288851350
## 179 TUN -0.418851500
## 180 TUR 0.641065250
## 181 TUV 0.010356520
## 182 UAE -0.697762150
## 183 UGA -0.650631000
## 184 UKG 1.988901500
## 185 UKR -2.130159000
## 186 URU 0.330888350
## 187 USA 2.355142000
## 188 UZB 0.215615600
## 189 VAN -0.141735900
## 190 VEN 0.046274315
## 191 WSM 0.232526200
## 192 YAR -0.850300900
## 193 YPR -1.228577000
## 194 YUG -0.673061800
## 195 ZAM -0.579048600
## 196 ZAN -0.142002800
## 197 ZIM -0.918304300
aggregate(PctAgreeRUSSIA ~ Year, data=d, FUN=mean)
## Year PctAgreeRUSSIA
## 1 1946 0.3616732
## 2 1947 0.3826490
## 3 1948 0.2794649
## 4 1949 0.3772119
## 5 1950 0.3118654
## 6 1951 0.4015296
## 7 1952 0.3453688
## 8 1953 0.5059253
## 9 1954 0.4809033
## 10 1955 0.5027162
## 11 1956 0.3456083
## 12 1957 0.4344091
## 13 1958 0.4835314
## 14 1959 0.4738003
## 15 1960 0.4495182
## 16 1961 0.4120274
## 17 1962 0.4813319
## 18 1963 0.4423542
## 19 1965 0.6094526
## 20 1966 0.5081736
## 21 1967 0.5817425
## 22 1968 0.4326419
## 23 1969 0.5363912
## 24 1970 0.4901029
## 25 1971 0.5663268
## 26 1972 0.5765383
## 27 1973 0.6005047
## 28 1974 0.6153276
## 29 1975 0.6141141
## 30 1976 0.6719926
## 31 1977 0.6115973
## 32 1978 0.5710776
## 33 1979 0.6055594
## 34 1980 0.6322449
## 35 1981 0.6877481
## 36 1982 0.6904781
## 37 1983 0.6992531
## 38 1984 0.7429096
## 39 1985 0.7234547
## 40 1986 0.7643367
## 41 1987 0.8523229
## 42 1988 0.8620272
## 43 1989 0.8720833
## 44 1990 0.8489548
## 45 1991 0.5881095
## 46 1992 0.5544948
## 47 1993 0.5172569
## 48 1994 0.5472591
## 49 1995 0.5871338
## 50 1996 0.6163929
## 51 1997 0.6536577
## 52 1998 0.6499320
## 53 1999 0.6654177
## 54 2000 0.6975194
## 55 2001 0.6940387
## 56 2002 0.7014127
## 57 2003 0.7217124
## 58 2004 0.7096238
## 59 2005 0.6933581
## 60 2006 0.7098116
## 61 2007 0.6857800
## 62 2008 0.6664526
## 63 2009 0.6492677
## 64 2010 0.6917664
## 65 2011 0.6604394
## 66 2012 0.6540915
plot(aggregate(PctAgreeRUSSIA ~ Year, data=d, FUN=mean), type="l")
aggregate(idealpoint ~ CountryAbb, data=d, FUN=length)
## CountryAbb idealpoint
## 1 AAB 32
## 2 AFG 66
## 3 ALB 56
## 4 ALG 50
## 5 AND 20
## 6 ANG 37
## 7 ARG 66
## 8 ARM 22
## 9 AUL 66
## 10 AUS 56
## 11 AZE 22
## 12 BAH 42
## 13 BAR 47
## 14 BEL 66
## 15 BEN 52
## 16 BFO 52
## 17 BHM 40
## 18 BHU 42
## 19 BLR 66
## 20 BLZ 32
## 21 BNG 39
## 22 BOL 66
## 23 BOS 20
## 24 BOT 47
## 25 BRA 66
## 26 BRU 29
## 27 BUI 49
## 28 BUL 56
## 29 CAM 52
## 30 CAN 66
## 31 CAO 52
## 32 CAP 38
## 33 CDI 52
## 34 CEN 45
## 35 CHA 51
## 36 CHL 66
## 37 CHN 42
## 38 COL 66
## 39 COM 38
## 40 CON 52
## 41 COS 66
## 42 CRO 21
## 43 CUB 66
## 44 CYP 52
## 45 CZE 46
## 46 CZR 20
## 47 DEN 66
## 48 DJI 36
## 49 DMA 31
## 50 DOM 64
## 51 DRC 52
## 52 DRV 36
## 53 ECU 66
## 54 EGY 66
## 55 EQG 40
## 56 ERI 20
## 57 EST 22
## 58 ETH 66
## 59 ETM 11
## 60 FIJ 43
## 61 FIN 56
## 62 FRN 66
## 63 FSM 21
## 64 GAB 52
## 65 GAM 48
## 66 GDR 17
## 67 GFR 40
## 68 GHA 56
## 69 GNB 36
## 70 GRC 66
## 71 GRG 20
## 72 GRN 39
## 73 GUA 66
## 74 GUI 54
## 75 GUY 47
## 76 HAI 66
## 77 HON 66
## 78 HUN 57
## 79 ICE 66
## 80 IND 66
## 81 INS 61
## 82 IRE 56
## 83 IRN 66
## 84 IRQ 57
## 85 ISR 64
## 86 ITA 56
## 87 JAM 50
## 88 JOR 57
## 89 JPN 56
## 90 KEN 49
## 91 KUW 49
## 92 KYR 19
## 93 KZK 21
## 94 LAO 56
## 95 LAT 22
## 96 LBR 57
## 97 LEB 66
## 98 LES 47
## 99 LIB 56
## 100 LIE 23
## 101 LIT 22
## 102 LUX 66
## 103 MAA 50
## 104 MAC 20
## 105 MAD 44
## 106 MAG 52
## 107 MAL 55
## 108 MAS 46
## 109 MAW 48
## 110 MEX 66
## 111 MLD 21
## 112 MLI 52
## 113 MLT 48
## 114 MNC 20
## 115 MNG 7
## 116 MON 51
## 117 MOR 56
## 118 MSI 22
## 119 MYA 64
## 120 MZM 38
## 121 NAM 23
## 122 NAU 26
## 123 NEP 56
## 124 NEW 66
## 125 NIC 66
## 126 NIG 52
## 127 NIR 49
## 128 NOR 66
## 129 NTH 66
## 130 OMA 42
## 131 PAK 65
## 132 PAL 15
## 133 PAN 66
## 134 PAR 66
## 135 PER 66
## 136 PHI 66
## 137 PNG 38
## 138 POL 66
## 139 POR 56
## 140 PRK 22
## 141 QAT 42
## 142 ROK 22
## 143 ROM 57
## 144 RUS 66
## 145 RWA 50
## 146 SAF 48
## 147 SAL 66
## 148 SAU 66
## 149 SEN 52
## 150 SEY 33
## 151 SIE 51
## 152 SIN 48
## 153 SKN 30
## 154 SLO 20
## 155 SLU 34
## 156 SLV 21
## 157 SNM 21
## 158 SOL 34
## 159 SOM 42
## 160 SPN 57
## 161 SRI 57
## 162 SSUD 2
## 163 STP 29
## 164 SUD 56
## 165 SUR 38
## 166 SVG 32
## 167 SWA 45
## 168 SWD 66
## 169 SWZ 11
## 170 SYR 63
## 171 TAJ 20
## 172 TAW 25
## 173 TAZ 51
## 174 THI 65
## 175 TKM 21
## 176 TOG 52
## 177 TON 14
## 178 TRI 50
## 179 TUN 56
## 180 TUR 66
## 181 TUV 12
## 182 UAE 42
## 183 UGA 50
## 184 UKG 66
## 185 UKR 66
## 186 URU 66
## 187 USA 66
## 188 UZB 18
## 189 VAN 31
## 190 VEN 66
## 191 WSM 36
## 192 YAR 65
## 193 YPR 23
## 194 YUG 59
## 195 ZAM 48
## 196 ZAN 1
## 197 ZIM 33
The other type of reshaping process is transforming a dataset from long (our preferred format!) to wide; and the other way around.
dd <- d[d$CountryAbb %in% c("HUN", "AUS") & d$Year %in% 2010:2012,
c("Year", "CountryAbb", "idealpoint")]
# converting to wide format
library(reshape)
(dc <- cast(dd, CountryAbb ~ Year, value="idealpoint"))
## CountryAbb 2010 2011 2012
## 1 AUS 0.7295488 0.7629837 0.7484664
## 2 HUN 1.0033000 0.9294886 1.0378380
# converting back to long format
melt(dc, id.vars=c("CountryAbb", "Year"))
## CountryAbb value Year
## X2010 AUS 0.7295488 2010
## X2010.1 HUN 1.0033000 2010
## X2011 AUS 0.7629837 2011
## X2011.1 HUN 0.9294886 2011
## X2012 AUS 0.7484664 2012
## X2012.1 HUN 1.0378380 2012
When collecting data from the web, we need to expect the best but always prepare for the worst. You don’t want to leave your computer scraping data overnight just to wake up and discover there was an error with one the websites you were scraping and no data was downloaded.
To deal with these issues, we can use some of the options that R offers to handle errors.
# this will give an error
for (i in 1:10){
message(i)
readLines("file.json")
}
We can use try
to ignore any error that the function may return.
# try, but not saving output of error
for (i in 1:10){
message(i)
try(readLines("file.json"))
}
## 1
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 2
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 3
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 4
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 5
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 6
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 7
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 8
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 9
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## 10
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
But generally, tryCatch
is better because it will let us store the error message and react in response to it
for (i in 1:10){
message(i)
error <- tryCatch(readLines("file.json"), error=function(e) e)
if (grepl("Error in file", as.character(error))){
message("Error! Maybe the file does not exist?")
}
}
## 1
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 2
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 3
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 4
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 5
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 6
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 7
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 8
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 9
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
## 10
## Warning in file(con, "r"): cannot open file 'file.json': No such file or
## directory
## Error! Maybe the file does not exist?
Let’s see it in the wild…
library(tweetscores)
accounts <- c("realDonaldTru", "HillaryClinton", "BernieSanders", "tedcruz")
load("~/my_oauth")
while (length(accounts) > 0){
# sample randomly one account to get tweets
new.user <- sample(as.character(accounts), 1)
message(new.user)
error <- tryCatch(friends <- getFriends(screen_name=new.user,
oauth=my_oauth,
verbose = TRUE), error=function(e) e)
if (inherits(error, 'error')) {
cat("Error! On to the next one...")
next
}
fname <- paste0("~/data/", new.user, '.rdata')
save(friends, file=fname)
accounts <- accounts[-which(accounts %in% new.user)]
}