A word cloud is a collection of words, and a simple yet powerful way to visualize text data. The more frequent (and therefore the more important) a word is in the text, the bigger and bolder it is shown.

Today I will try to make a word cloud using the titles of research articles. The dataset is the "mantle xenolith" collection from PetDB.

geoscience earth science mantle word cloud text analysis

Package

library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
library(tidyverse)
library(dplyr)

Read the text file
# Load the article list from a CSV file located in the working directory.
data <- read.csv(file = "petdb_mantle_xenolith_title.csv")

Screening the data 
Since the dataset is too large, I selected only the articles published in LITHOS.
# Keep only the rows whose Journal column is exactly "LITHOS".
data_new <- filter(data, Journal == "LITHOS")
# The second column holds the article titles.
text <- data_new[[2]]

Load the data as a corpus
# Turn the vector of titles into a tm corpus.
docs <- text %>%
  VectorSource() %>%
  Corpus()

Inspect the content of the document
# Inspect the content of the corpus: prints its metadata summary followed by
# the text of the documents.
inspect(docs)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 177
## 
##   [1] A FORE-ARC SETTING OF THE GERF OPHIOLITE, EASTERN DESERT, EGYPT: EVIDENCE FROM MINERAL CHEMISTRY AND GEOCHEMISTRY OF ULTRAMAFITES                                                                                                
##   [2] SERPENTINIZATION AND DEHYDRATION IN THE UPPER MANTLE BENEATH FUERTEVENTURA (EASTERN CANARY ISLANDS): EVIDENCE FROM MANTLE XENOLITHS                                                                                              
##   [3] METASOMATISM IN LITHOSPHERIC MANTLE ROOTS: CONSTRAINTS FROM WHOLE-ROCK AND MINERAL CHEMICAL COMPOSITION OF DEFORMED PERIDOTITE XENOLITHS FROM KIMBERLITE PIPE UDACHNAYA                                                          
##   [4] GEOCHEMISTRY OF ECLOGITE XENOLITHS FROM THE UDACHNAYA KIMBERLITE PIPE: SECTION OF ANCIENT OCEANIC CRUST SAMPLED                                                                                                                  
##   [5] NATURE OF THE LITHOSPHERIC MANTLE BENEATH THE ARABIAN SHIELD AND GENESIS OF AL-SPINEL MICROPODS: EVIDENCE FROM THE MANTLE XENOLITHS OF HARRAT KISHB, WESTERN SAUDI ARABIA                                                        
##   [6] PERSISTENCE OF FERTILE AND HYDROUS LITHOSPHERIC MANTLE BENEATH THE NORTHWESTERN ETHIOPIAN PLATEAU: EVIDENCE FROM MODAL, TRACE ELEMENT AND SR-ND-HF ISOTOPIC COMPOSITIONS OF AMPHIBOLE-BEARING MANTLE XENOLITHS  
........                

Transformations are performed with the tm_map() function, for example to replace special characters in the text with spaces
# Build a tm transformer that substitutes every match of `pattern` with a
# single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Apply it for "/", "@" and "|" in turn. The pipe is escaped because gsub()
# treats the pattern as a regular expression.
docs <- Reduce(
  function(corpus, pattern) tm_map(corpus, toSpace, pattern),
  c("/", "@", "\\|"),
  docs
)

Convert the text to lower case
# Lower-case every document in the corpus.
docs <- docs %>% tm_map(content_transformer(tolower))

Remove numbers
# Strip numbers from every document.
docs <- docs %>% tm_map(removeNumbers)

Remove common English stopwords
# Drop the words listed in tm's English stopword list.
docs <- docs %>% tm_map(removeWords, stopwords("english"))

Remove punctuation
# Strip punctuation characters from every document.
docs <- docs %>% tm_map(removePunctuation)

# Remove your own stopwords
# Specify your stopwords as a character vector, for example:
# docs <- tm_map(docs, removeWords, c("word1", "word2"))

Eliminate extra white spaces
# Collapse the extra whitespace left behind by the previous removals.
docs <- docs %>% tm_map(stripWhitespace)

Text stemming
I will make word clouds both with and without text stemming
# Stem the words of every document. The result is stored as docs_new so the
# unstemmed corpus in docs stays available.
docs_new <- docs %>% tm_map(stemDocument)


Build a term-document matrix
The term-document matrix is a table containing the frequency of the words. Row names are words (terms) and column names are documents, so summing each row gives the total frequency of a word.
# Term-document matrix of the unstemmed corpus, converted to a dense matrix.
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
# Total count of each term over all documents, most frequent first.
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# Frequency table: one row per term, with the term itself as the row name.
d <- data.frame(word = names(v), freq = v)
# Show the 30 most frequent terms.
head(d, n = 30)
##                      word freq
## mantle             mantle  133
## xenoliths       xenoliths  104
## evidence         evidence   44
## peridotite     peridotite   42
## beneath           beneath   38
## lithospheric lithospheric   31
## china               china   27
## craton             craton   26
## metasomatism metasomatism   21
## geochemistry geochemistry   20
## evolution       evolution   19
## north               north   19
## central           central   18
## kimberlite     kimberlite   16
## petrology       petrology   16
## south               south   16
## implications implications   16
## constraints   constraints   15
## peridotites   peridotites   15
## geochemical   geochemical   15
## eastern           eastern   14
## isotopic         isotopic   14
## element           element   13
## spinel             spinel   13
## melt                 melt   12
## lithosphere   lithosphere   12
## upper               upper   11
## trace               trace   11
## processes       processes   11
## isotope           isotope   11

The result after text stemming
# Term-document matrix of the stemmed corpus, converted to a dense matrix.
dtm <- TermDocumentMatrix(docs_new)
m <- as.matrix(dtm)
# Total count of each stem over all documents, most frequent first.
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# Frequency table: one row per stem, with the stem itself as the row name.
d_new <- data.frame(word = names(v), freq = v)
# Show the 30 most frequent stems.
head(d_new, n = 30)
##                      word freq
## mantl               mantl  133
## xenolith         xenolith  107
## peridotit       peridotit   58
## evid                 evid   45
## lithospher     lithospher   43
## beneath           beneath   38
## metasomat       metasomat   34
## isotop             isotop   33
## craton             craton   32
## china               china   27
## kimberlit       kimberlit   26
## melt                 melt   23
## petrolog         petrolog   22
## geochemistri geochemistri   20
## evolut             evolut   19
## north               north   19
## composit         composit   18
## central           central   18
## element           element   17
## south               south   16
## implic             implic   16
## constraint     constraint   15
## geochem           geochem   15
## eastern           eastern   14
## spinel             spinel   14
## miner               miner   13
## upper               upper   11
## trace               trace   11
## process           process   11
## northern         northern   11

Visualization 1
# Fix the random seed before drawing.
set.seed(1234)
# Word cloud of the unstemmed terms: at most 200 words, "Dark2" palette.
wordcloud(
  words = d[["word"]],
  freq = d[["freq"]],
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
Visualization 2 (color)
# Fix the random seed before drawing.
set.seed(1234)
# Word cloud of the unstemmed terms: at most 100 words, "Spectral" palette.
wordcloud(
  words = d[["word"]],
  freq = d[["freq"]],
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Spectral")
)

Visualization 3-1-1 (color)
# Fix the random seed before drawing.
set.seed(1234)
# Word cloud of the unstemmed terms: at most 100 words, "RdYlBu" palette.
wordcloud(
  words = d[["word"]],
  freq = d[["freq"]],
  min.freq = 1,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "RdYlBu")
)
geoscience earth science mantle word cloud text analysis


Visualization 3-1-2 (word frequency)
# Keep only the first 50 rows of d. These are the most frequent terms, because
# d was built from a vector sorted by descending frequency.
d <- head(d, 50)
# Horizontal bar chart of term frequency with terms ordered by frequency.
# The fill colour follows the bar length. after_stat(x) replaces the `..x..`
# notation, which is deprecated since ggplot2 3.4.0.
p <- ggplot(
  d,
  aes(x = freq, y = fct_reorder(word, freq), fill = after_stat(x))
) +
  geom_bar(stat = "identity")
p <- p + scale_fill_distiller(palette = "RdYlBu")
plot(p)
geoscience earth science mantle word cloud text analysis

Visualization 3-2-1 (color)
The result after text stemming
# Fix the random seed before drawing.
set.seed(1234)
# Word cloud of the stemmed terms. Both the words and their frequencies must
# come from d_new: pairing d_new$word with d$freq would size each stem by the
# count of an unrelated unstemmed word, and the two vectors differ in length.
wordcloud(
  words = d_new$word, freq = d_new$freq, min.freq = 1,
  max.words = 100, random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "RdYlBu")
)


Visualization 3-2-2 (word frequency)
# Keep only the first 50 rows of d_new. These are the most frequent stems,
# because d_new was built from a vector sorted by descending frequency.
d_new <- head(d_new, 50)
# Horizontal bar chart of stem frequency with stems ordered by frequency.
# The fill colour follows the bar length. after_stat(x) replaces the `..x..`
# notation, which is deprecated since ggplot2 3.4.0.
p <- ggplot(
  d_new,
  aes(x = freq, y = fct_reorder(word, freq), fill = after_stat(x))
) +
  geom_bar(stat = "identity")
p <- p + scale_fill_distiller(palette = "RdYlBu")
plot(p)



References