A common issue when working with CJK (Chinese, Japanese, Korean) characters is getting them to display properly in plots. Here's one example of how to fix that problem.
# reading into R
tweets <- streamR::parseTweets("../data/japanese-tweets.json", simplify=TRUE)
## 176 tweets have been parsed.
library(quanteda)
## quanteda version 0.9.9.65
## Using 3 of 4 cores for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
tw <- corpus(tweets$text)
twdfm <- dfm_select(dfm(tw, remove_punct = TRUE, verbose = TRUE, remove_url = TRUE),
                    min_nchar = 2)
## Creating a dfm from a corpus ...
## ... tokenizing texts
## ... lowercasing
## ... found 176 documents, 1,326 features
## ... created a 176 x 1,326 sparse dfm
## ... complete.
## Elapsed time: 0.001 seconds.
topfeatures(twdfm, n=25)
## rt って から ない さん
## 86 35 26 22 21
## です する した フォロー プレゼント
## 18 17 17 15 15
## ます 抽選 ローリー ショコラ 締切
## 15 14 14 14 12
## @mayla_classic amp 応募 完了 この
## 11 11 11 11 11
## ちょっと これ 可愛い くだ さい
## 11 10 10 10 10
What doesn’t work:
textplot_wordcloud(twdfm, rot.per=0, scale=c(3, .75), max.words=100)
But this should now work:
pdf("wordcloud.pdf", family="Japan1")
textplot_wordcloud(twdfm, rot.per=0, scale=c(3, .75), max.words=100)
dev.off()
## quartz_off_screen
## 2
How do you choose the font family? See ?postscriptFonts.
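For instance, one option is to list the font families registered with the pdf() device; the CJK-capable CID families ("Japan1", which we used above, plus "Korea1", "GB1", and "CNS1") are among them:
# font families currently registered with the pdf() device;
# the family= argument above must be one of these
names(grDevices::pdfFonts())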
Unicode text can take different forms. Here we'll look at some of the most common ones and see how to avoid errors when parsing text scraped from the web. We'll be using the stringi package for some of the code.
# some text in German
de <- "Einbahnstraße"
# all good!
message(de)
## Einbahnstraße
# what if it looks like this? (Unicode characters)
de <- "Einbahnstra\u00dfe"
# as long as encoding is properly declared, all will be fine and
# we can switch back and forth
Encoding(de) # this should be UTF-8
## [1] "UTF-8"
message(de)
## Einbahnstraße
Encoding(de) <- "latin1"
message(de)
## EinbahnstraÃe
Encoding(de) <- "UTF-8"
message(de)
## Einbahnstraße
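Note that Encoding()<- only changes the declared encoding of the string, without touching the underlying bytes, which is why the latin1 declaration above produced mojibake. To actually re-encode the bytes, use iconv(). A minimal sketch:
# iconv() converts the bytes themselves; Encoding()<- only relabels them
de_latin1 <- iconv(de, from = "UTF-8", to = "latin1")
Encoding(de_latin1)
## [1] "latin1"
message(iconv(de_latin1, from = "latin1", to = "UTF-8"))
## Einbahnstraße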
# we can also use the stringi package
library(stringi)
stri_unescape_unicode("Einbahnstra\u00dfe")
## [1] "Einbahnstraße"
# what if it looks like this? (Unicode characters as hex byte codes)
# see: http://www.fileformat.info/info/unicode/char/00df/index.htm
de <- "Einbahnstra<c3><9f>e"
# this will not work:
stri_unescape_unicode(de)
## [1] "Einbahnstra<c3><9f>e"
# one solution from stack overflow:
# https://stackoverflow.com/questions/25468716/convert-byte-encoding-to-unicode
# find all <xx> byte codes, convert each to its raw byte, and
# substitute the bytes back into the string
m <- gregexpr("<[0-9a-f]{2}>", de)
codes <- regmatches(de, m)
chars <- lapply(codes, function(x) {
    rawToChar(as.raw(strtoi(paste0("0x", substr(x, 2, 3)))), multiple = TRUE)
})
regmatches(de, m) <- chars
de
## [1] "Einbahnstraße"
# what is happening here? We're replacing:
codes
## [[1]]
## [1] "<c3>" "<9f>"
# with:
chars
## [[1]]
## [1] "\xc3" "\x9f"
# switching to a different language...
# what if it looks like this?
example <- c(
    "SAD DA POMOGNU RJE<U+0160>AVANJE POLITI<U+010C>KE KRIZE",
    "PROBLEME GURAJU POD TEPIH",
    "ODAO PRIZNANJE DR<U+017D>AVI")
# different representation of Unicode characters, e.g.:
# http://www.fileformat.info/info/unicode/char/0160/index.htm
# Things get even more complicated...
# One solution here:
# https://stackoverflow.com/questions/28248457/gsub-in-r-with-unicode-replacement-give-different-results-under-windows-compared
# we're basically going to convert to regular Unicode characters that
# R will be able to parse
trueunicode.hack <- function(string){
    m <- gregexpr("<U\\+[0-9A-F]{4}>", string)
    if(-1==m[[1]][1])
        return(string)
    codes <- unlist(regmatches(string, m))
    replacements <- codes
    N <- length(codes)
    for(i in 1:N){
        replacements[i] <- intToUtf8(strtoi(paste0("0x", substring(codes[i], 4, 7))))
    }
    # if the string doesn't start with a unicode escape, copy its initial part
    # up to the first occurrence of an escape
    if(1!=m[[1]][1]){
        y <- substring(string, 1, m[[1]][1]-1)
        y <- paste0(y, replacements[1])
    }else{
        y <- replacements[1]
    }
    # if there is more than one unicode escape in the string
    if(1<N){
        for(i in 2:N){
            s <- gsub("<U\\+[0-9A-F]{4}>", replacements[i],
                      substring(string, m[[1]][i-1]+8, m[[1]][i]+7))
            Encoding(s) <- "UTF-8"
            y <- paste0(y, s)
        }
    }
    # get the trailing contents, if any
    if( nchar(string)>(m[[1]][N]+8) )
        y <- paste0( y, substring(string, m[[1]][N]+8, nchar(string)) )
    y
}
trueunicode.hack(example[1])
## [1] "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
trueunicode.hack(example[2])
## [1] "PROBLEME GURAJU POD TEPIH"
trueunicode.hack(example[3])
## [1] "ODAO PRIZNANJE DRŽAVI"
# and here's how we would convert back and forth...
# same text in Croatian
example <- "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
Encoding(example) # note: "unknown" here, because the literal was entered in the native locale
## [1] "unknown"
# convert to ASCII and delete non-ASCII characters
iconv(example, "UTF-8", "ASCII", sub="")
## [1] "SAD DA POMOGNU RJEAVANJE POLITIKE KRIZE"
# convert to latin1, substituting non-convertible characters with their byte codes
(lat <- iconv(example, "UTF-8", "latin1", sub="byte"))
## [1] "SAD DA POMOGNU RJE<c5><a0>AVANJE POLITI<c4><8c>KE KRIZE"
m <- gregexpr("<[0-9a-f]{2}>", lat)
codes <- regmatches(lat, m)
chars <- lapply(codes, function(x) {
    rawToChar(as.raw(strtoi(paste0("0x", substr(x, 2, 3)))), multiple = TRUE)
})
regmatches(lat, m) <- chars
lat
## [1] "SAD DA POMOGNU RJEŠAVANJE POLITIČKE KRIZE"
And one final example…
library(corpus)
## Warning: package 'corpus' was built under R version 3.4.1
##
## Attaching package: 'corpus'
## The following object is masked from 'package:quanteda':
##
## stopwords
example <- "\U0001F602 \U0001F64C \U0001F602" # extended unicode characters (emoji)
utf8_print(example)
## [1] "😂 🙌 😂"
# you can search for the unicode representations of all these characters online
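Instead of searching online, you can also recover the code points directly in R with utf8ToInt():
# code point of the first emoji, in the usual U+XXXX notation
sprintf("U+%X", utf8ToInt("\U0001F602"))
## [1] "U+1F602"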