A common issue when working with CJK (Chinese, Japanese, Korean) characters is getting them to display properly in plots. Here's one example of how to fix that problem.
# reading into R
tweets <- streamR::parseTweets("../data/japanese-tweets.json", simplify=TRUE)
## 176 tweets have been parsed.
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
tw <- corpus(tweets$text)
twdfm <- dfm_select(dfm(tw, remove_punct = TRUE, verbose = TRUE, remove_url = TRUE),
                    min_nchar = 2)
## Creating a dfm from a corpus ...
## ... tokenizing texts
## ... lowercasing
## ... found 176 documents, 1,326 features
## ... created a 176 x 1,326 sparse dfm
## ... complete.
## Elapsed time: 0.001 seconds.
topfeatures(twdfm, n=25)
## rt って から ない さん
## 86 35 26 22 21
## です する した フォロー プレゼント
## 18 17 17 15 15
## ます 抽選 ローリー ショコラ 締切
## 15 14 14 14 12
## @mayla_classic amp 応募 完了 この
## 11 11 11 11 11
## ちょっと これ 可愛い くだ さい
## 11 10 10 10 10
What doesn’t work:
textplot_wordcloud(twdfm, rot.per=0, scale=c(3, .75), max.words=100)
But this should now work:
pdf("wordcloud.pdf", family="Japan1")
textplot_wordcloud(twdfm, rot.per=0, scale=c(3, .75), max.words=100)
dev.off()
## quartz_off_screen
## 2
How do you choose the font family? See ?postscriptFonts.
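For instance, one option is to list the font families registered with the pdf() device; the CJK-capable CID families ("Japan1", which we used above, plus "Korea1", "GB1", and "CNS1") are among them:
# font families currently registered with the pdf() device;
# the family= argument above must be one of these
names(grDevices::pdfFonts())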
Unicode text can take different forms. Here we'll look at some of the most common ones and see how to avoid errors when parsing text scraped from the web. We'll be using the stringi package for some of the code.
# some text in German
de <- "Einbahnstraße"
# all good!
message(de)
## Einbahnstraße
# what if it looks like this? (Unicode characters)
de <- "Einbahnstra\u00dfe"
# as long as encoding is properly declared, all will be fine and
# we can switch back and forth
Encoding(de) # this should be UTF-8
## [1] "UTF-8"
message(de)
## Einbahnstraße
Encoding(de) <- "latin1"
message(de)
## EinbahnstraÃe
Encoding(de) <- "UTF-8"
message(de)
## Einbahnstraße
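Note that Encoding()<- only changes the declared encoding of the string, without touching the underlying bytes, which is why the latin1 declaration above produced mojibake. To actually re-encode the bytes, use iconv(). A minimal sketch:
# iconv() converts the bytes themselves; Encoding()<- only relabels them
de_latin1 <- iconv(de, from = "UTF-8", to = "latin1")
Encoding(de_latin1)
## [1] "latin1"
message(iconv(de_latin1, from = "latin1", to = "UTF-8"))
## Einbahnstraße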
# we can also use the stringi package
library(stringi)
stri_unescape_unicode("Einbahnstra\u00dfe")
## [1] "Einbahnstraße"
# what if it looks like this? (Unicode characters as hex byte codes)
# see: http://www.fileformat.info/info/unicode/char/00df/index.htm
de <- "Einbahnstra<c3><9f>e"
# this will not work:
stri_unescape_unicode(de)
## [1] "Einbahnstra<c3><9f>e"
# one solution from stack overflow:
# https://stackoverflow.com/questions/25468716/convert-byte-encoding-to-unicode
# find all <xx> byte codes, convert each to its raw byte, and
# substitute the bytes back into the string
m <- gregexpr("<[0-9a-f]{2}>", de)
codes <- regmatches(de, m)
chars <- lapply(codes, function(x) {
    rawToChar(as.raw(strtoi(paste0("0x", substr(x, 2, 3)))), multiple = TRUE)
})
regmatches(de, m) <- chars
de
## [1] "Einbahnstraße"
# what is happening here? We're replacing:
codes
## [[1]]
## [1] "<c3>" "<9f>"
# with:
chars
## [[1]]
## [1] "\xc3" "\x9f"
# switching to a different language...
# what if it looks like this?
example <- c(
    "SAD DA POMOGNU RJE<U+0160>AVANJE POLITI<U+010C>KE KRIZE",
    "PROBLEME GURAJU POD TEPIH",
    "ODAO PRIZNANJE DR<U+017D>AVI")
# different representation of Unicode characters, e.g.:
# http://www.fileformat.info/info/unicode/char/0160/index.htm
# Things get even more complicated...
# One solution here:
# https://stackoverflow.com/questions/28248457/gsub-in-r-with-unicode-replacement-give-different-results-under-windows-compared
# we're basically going to convert to regular Unicode characters that
# R will be able to parse
trueunicode.hack <- function(string){
    m <- gregexpr("<U\\+[0-9A-F]{4}>", string)
    if(-1==m[[1]][1])
        return(string)
    codes <- unlist(regmatches(string, m))
    replacements <- codes
    N <- length(codes)
    for(i in 1:N){
        replacements[i] <- intToUtf8(strtoi(paste0("0x", substring(codes[i], 4, 7))))
    }
    # if the string doesn't start with a unicode escape, copy its initial part
    # up to the first occurrence of an escape
    if(1!=m[[1]][1]){
        y <- substring(string, 1, m[[1]][1]-1)
        y <- paste0(y, replacements[1])
    }else{
        y <- replacements[1]
    }
    # if there is more than one unicode escape in the string
    if(1<N){
        for(i in 2:N){
            s <- gsub("<U\\+[0-9A-F]{4}>", replacements[i],
                      substring(string, m[[1]][i-1]+8, m[[1]][i]+7))
            Encoding(s) <- "UTF-8"
            y <- paste0(y, s)
        }
    }
    # get the trailing contents, if any
    if( nchar(string)>(m[[1]][N]+8) )
        y <- paste0( y, substring(string, m[[1]][N]+8, nchar(string)) )
    y
}
trueunicode.hack(example[1])
## [1] "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
trueunicode.hack(example[2])
## [1] "PROBLEME GURAJU POD TEPIH"
trueunicode.hack(example[3])
## [1] "ODAO PRIZNANJE DRŽAVI"
# and here's how we would convert back and forth...
# same text in Croatian
example <- "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
Encoding(example) # note: "unknown" here, because the literal was entered in the native locale
## [1] "unknown"
# convert to ASCII and delete non-ASCII characters
iconv(example, "UTF-8", "ASCII", sub="")
## [1] "SAD DA POMOGNU RJEAVANJE POLITIKE KRIZE"
# convert to latin1, substituting non-convertible characters with their byte codes
(lat <- iconv(example, "UTF-8", "latin1", sub="byte"))
## [1] "SAD DA POMOGNU RJE<c5><a0>AVANJE POLITI<c4><8c>KE KRIZE"
m <- gregexpr("<[0-9a-f]{2}>", lat)
codes <- regmatches(lat, m)
chars <- lapply(codes, function(x) {
    rawToChar(as.raw(strtoi(paste0("0x", substr(x, 2, 3)))), multiple = TRUE)
})
regmatches(lat, m) <- chars
lat
## [1] "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
And one final example…
library(corpus)
## Warning: package 'corpus' was built under R version 3.4.1
##
## Attaching package: 'corpus'
## The following object is masked from 'package:quanteda':
##
## stopwords
example <- "\U0001F602 \U0001F64C \U0001F602" # extended unicode characters (emoji)
utf8_print(example)
## [1] "😂 🙌 😂"
# you can search for the unicode representations of all these characters online
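Instead of searching online, you can also recover the code points directly in R with utf8ToInt():
# code point of the first emoji, in the usual U+XXXX notation
sprintf("U+%X", utf8ToInt("\U0001F602"))
## [1] "U+1F602"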