Text preprocessing in a different language

With these options it is possible to preprocess English-language text for analysis:

# Lemmatise every term with spaCy (POS tagging switched off), then rebuild one
# space-separated string of lemmas per id. The id is the doc_id with a trailing
# "-suffix" stripped; doc_ids without a hyphen are kept unchanged.
dflemma <-
  setNames(df2$term, df2$id) %>%
  spacy_parse(lemma = TRUE, pos = FALSE) %>%
  group_by(id = sub("(.+)-(.+)", "\\1", doc_id)) %>%
  summarise(text = paste(lemma, collapse = " "))

# Build a quanteda corpus from the lemmatised text, one document per id.
myCorpus <- corpus(dflemma$text, docnames = dflemma$id)

# Extra stopwords to drop on top of the English SMART list.
mystopwords <- "can"

# Tokenise (dropping punctuation, numbers and symbols), remove stopwords,
# and build the document-feature matrix.
myDfm <- myCorpus %>%
  tokens(
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE
  ) %>%
  tokens_remove(
    pattern = c(stopwords("en", source = "smart"), mystopwords)
  ) %>%
  dfm(verbose = FALSE)

How can I remove stopwords and apply stemming for German and Greek text?

Solution

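Both German and Greek are supported for stemming and stopword removal, so both should be easy to apply in quanteda. Note that the default ("snowball") stopword source does not include Greek, so for Greek you need to choose a source that does, such as "stopwords-iso".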

library("quanteda")
## Package version: 3.2.0.9000
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.

# Sample sentences used to demonstrate stopword removal and stemming.
# German: "How can I perform natural language processing for texts in other
# languages?"
txt_german <- "Wie kann ich eine natürliche Sprachverarbeitung für Texte in anderen Sprachen durchführen?"
# Greek: the same question ("How can I carry out natural language processing
# on texts in other languages?").
txt_greek <- "Πώς μπορώ να πραγματοποιήσω επεξεργασία φυσικής γλώσσας σε κείμενα σε άλλες γλώσσες;"

# German: tokenise without punctuation, drop German stopwords, then stem.
txt_german %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(pattern = stopwords(language = "de")) %>%
  tokens_wordstem(language = "de")
## Tokens consisting of 1 document.
## text1 :
## [1] "natur"           "Sprachverarbeit" "Text"            "Sprach"         
## [5] "durchfuhr"

# Greek: tokenise without punctuation, drop Greek stopwords, then stem.
# The language must be Greek ("el") in both steps; using the German settings
# ("de") leaves Greek text untouched: no stopwords match and nothing is
# stemmed. Greek is not in the default "snowball" stopword source, so the
# "stopwords-iso" source is requested explicitly.
tokens(txt_greek, remove_punct = TRUE) %>%
  tokens_remove(stopwords("el", source = "stopwords-iso")) %>%
  tokens_wordstem(language = "el")
# NOTE(review): printed output intentionally omitted. The previously shown
# output was produced with the German settings and listed all twelve tokens
# unchanged. Re-run to regenerate; the exact tokens depend on the installed
# stopwords-iso list and on a SnowballC version that ships the Greek stemmer
# (check SnowballC::getStemLanguages()).