Wordclouds with Japanese, Korean, and Chinese characters

# Restore saved Twitter OAuth credentials (loads the `my_oauth` object
# used below) and read a list of Japanese stopwords, one per line.
load("../credentials/twitter-token.Rdata")
sw <- readLines("../data/stopwords-ja.txt")

# Capture 30 seconds of the filtered stream into a JSON file, using the
# stopwords as track terms — presumably to match a broad sample of
# Japanese-language tweets.
filterStream("../data/japanese-tweets.json",
    track=sw, timeout=30, oauth=my_oauth)
# reading into R: parse the raw JSON into a data frame of tweets
tweets <- streamR::parseTweets("../data/japanese-tweets.json", simplify=TRUE)
## 176 tweets have been parsed.
## 176 tweets have been parsed.
# Build a document-feature matrix from the tweet text with quanteda.
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
tw <- corpus(tweets$text)
# Tokenize into a dfm, dropping punctuation and URLs, then keep only
# features of at least two characters (drops most single-character
# noise tokens).
twdfm <- dfm_select(dfm(tw, remove_punct = TRUE, verbose=TRUE, remove_url=TRUE),
                    min_nchar=2)
## Creating a dfm from a corpus ...
##    ... tokenizing texts
##    ... lowercasing
##    ... found 176 documents, 1,326 features
##    ... created a 176 x 1,326 sparse dfm
##    ... complete. 
## Elapsed time: 0.001 seconds.
# The 25 most frequent features: a mix of "rt", Japanese tokens, and
# user handles.
topfeatures(twdfm, n=25)
##             rt           って           から           ない           さん 
##             86             35             26             22             21 
##           です           する           した       フォロー     プレゼント 
##             18             17             17             15             15 
##           ます           抽選       ローリー       ショコラ           締切 
##             15             14             14             14             12 
## @mayla_classic            amp           応募           完了           この 
##             11             11             11             11             11 
##       ちょっと           これ         可愛い           くだ           さい 
##             11             10             10             10             10

What doesn’t work:

# NOTE(review): this fails (or draws empty glyph boxes) because the
# default graphics device font appears to lack Japanese glyph coverage
# — confirm on your platform; the PDF device below fixes it.
textplot_wordcloud(twdfm, rot.per=0, scale=c(3, .75), max.words=100)

But this should now work:

# Open a PDF device with a font family that covers Japanese glyphs
# ("Japan1" is one of the CJK families listed in ?postscriptFonts),
# draw the wordcloud, then close the device.
pdf("wordcloud.pdf", family="Japan1")
textplot_wordcloud(twdfm, rot.per=0, scale=c(3, .75), max.words=100)
dev.off()
## quartz_off_screen 
##                 2

How to choose the font family? See ?postscriptFonts.

Dealing with Unicode headaches

Unicode text can take different forms. Here we’ll see some of the most common and how to avoid getting errors when we parse text scraped from the web. We’ll be using the stringi package for some of the code here.

# some text in German, with the non-ASCII character "ß" typed literally
de <- "Einbahnstraße"
# all good!
textplot_wordcloud(tokens(de))

# what if it looks like this? (Unicode escape sequence: "\u00df" is ß)
de <- "Einbahnstra\u00dfe"
# as long as encoding is properly declared, all will be fine and
# we can switch back and forth
Encoding(de)  # this should be UTF-8
## [1] "UTF-8"
message(de)
## Einbahnstraße
Encoding(de) <- "latin1"
message(de)
## Einbahnstraße
Encoding(de) <- "UTF-8"
message(de)
## Einbahnstraße
# we can also use the stringi package:
# stri_unescape_unicode() resolves "\uXXXX"-style escapes in a string
library(stringi)
stri_unescape_unicode("Einbahnstra\u00dfe")
## [1] "Einbahnstraße"
# what if it looks like this? (Unicode characters as hex byte codes)
# see: http://www.fileformat.info/info/unicode/char/00df/index.htm
de <- "Einbahnstra<c3><9f>e"
# this will not work — there are no "\u" escapes left to resolve:
stri_unescape_unicode(de)
## [1] "Einbahnstra<c3><9f>e"
# one solution from stack overflow:
# https://stackoverflow.com/questions/25468716/convert-byte-encoding-to-unicode
# find every "<xx>" token: two lowercase hex digits in angle brackets
m <- gregexpr("<[0-9a-f]{2}>", de)
codes <- regmatches(de,m)
# turn each hex pair into the corresponding raw byte (one char per code)
chars <- lapply(codes, function(x) {
    rawToChar(as.raw(strtoi(paste0("0x",substr(x,2,3)))), multiple=T)
})
# splice the raw bytes back in place of the "<xx>" tokens; here the two
# bytes <c3><9f> together are the UTF-8 encoding of ß
regmatches(de,m) <- chars
de
## [1] "Einbahnstraße"
# what is happening here? We're replacing:
codes
## [[1]]
## [1] "<c3>" "<9f>"
# with:
chars
## [[1]]
## [1] "\xc3" "\x9f"
# switching to a different language...
# what if it looks like this? ("<U+XXXX>" tokens, where XXXX is a
# four-digit hex code point — presumably as printed by R when the
# locale cannot display the character)
example <- c(
  "SAD DA POMOGNU RJE<U+0160>AVANJE POLITI<U+010C>KE KRIZE", 
  "PROBLEME GURAJU POD TEPIH", 
  "ODAO PRIZNANJE DR<U+017D>AVI")
# different representation of Unicode characters, e.g.:
# http://www.fileformat.info/info/unicode/char/0160/index.htm

# Things get even more complicated...
# One solution here:
# https://stackoverflow.com/questions/28248457/gsub-in-r-with-unicode-replacement-give-different-results-under-windows-compared
# we're basically going to convert to regular Unicode characters that
# R will be able to parse

# Convert literal "<U+XXXX>" tokens into the Unicode characters they
# stand for (e.g. "<U+0160>" becomes "Š").
#
# Args:
#   string: a character string (or vector of strings) possibly
#           containing "<U+XXXX>" tokens, where XXXX is a 4-digit hex
#           code point.
#
# Returns:
#   The input with every token replaced by its UTF-8 character;
#   strings without tokens are returned unchanged.
#
# Fixes an off-by-one in the original implementation: the trailing-text
# check `nchar(string) > (m[[1]][N]+8)` silently dropped the final
# character whenever exactly one character followed the last token
# (each token occupies 8 characters, so trailing text starts at
# m[[1]][N]+8 and the strict `>` missed it). Replacing matches in place
# via regmatches<- removes all manual prefix/suffix bookkeeping, so no
# position arithmetic can go wrong; it also accepts lowercase hex
# digits, a backward-compatible generalization.
trueunicode.hack <- function(string){
    # locate every "<U+XXXX>" token
    m <- gregexpr("<U\\+[0-9A-Fa-f]{4}>", string)
    # decode the 4 hex digits of one token into its UTF-8 character
    decode <- function(code) intToUtf8(strtoi(substring(code, 4, 7), base = 16L))
    # substitute each matched token in place; surrounding text is left
    # untouched (no-op when there are no matches)
    regmatches(string, m) <- lapply(
        regmatches(string, m),
        function(codes) vapply(codes, decode, character(1), USE.NAMES = FALSE)
    )
    string
}

# convert the "<U+XXXX>" tokens in each example string
trueunicode.hack(example[1])
## [1] "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
trueunicode.hack(example[2])
## [1] "PROBLEME GURAJU POD TEPIH"
trueunicode.hack(example[3])
## [1] "ODAO PRIZNANJE DRŽAVI"
# and here's how we would convert back and forth...
# same text in Croatian
example <- "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
Encoding(example) # reported as "unknown" here, not "UTF-8"
## [1] "unknown"
# convert to ASCII and delete non-ASCII characters (lossy!)
iconv(example, "UTF-8", "ASCII", sub="")
## [1] "SAD DA POMOGNU RJEAVANJE POLITIKE KRIZE"
# convert to latin1, substituting non-representable characters with
# "<xx>" hex byte codes
(lat <- iconv(example, "UTF-8", "latin1", sub="byte"))
## [1] "SAD DA POMOGNU RJE<c5><a0>AVANJE POLITI<c4><8c>KE KRIZE"
# same byte-splicing trick as above: decode each "<xx>" hex pair into a
# raw byte and reinsert it in place of the token
m <- gregexpr("<[0-9a-f]{2}>", lat)
codes <- regmatches(lat,m)
chars <- lapply(codes, function(x) {
    rawToChar(as.raw(strtoi(paste0("0x",substr(x,2,3)))), multiple=T)
})
regmatches(lat,m) <- chars
lat
## [1] "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"

And one final example…

# the corpus package provides utf8_print() for printing extended
# Unicode such as emoji
library(corpus)
## Warning: package 'corpus' was built under R version 3.4.1
## 
## Attaching package: 'corpus'
## The following object is masked from 'package:quanteda':
## 
##     stopwords
example <- "\U0001F64C" # extended Unicode character (outside the BMP)
utf8_print(example)
## [1] "🙌​"