Heutiger Testraketenstart

Tutorial:

UDPipe Introduction · NLP with R and UDPipe

Deutsches Skript

library(udpipe)

# Download the German model on the first run only: `overwrite = FALSE` skips
# the download when the model file is already present in the working directory.
# The model repository is pinned to UD 2.4 so that the downloaded file is
# expected to be "german-gsd-ud-2.4-190531.udpipe", the file name that is
# loaded further down in this script.
udmodel <- udpipe_download_model(
  language = "german",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.4",
  overwrite = FALSE
)

# Inspect the download result.
udmodel

# Annotate one sentence in a single call, passing the download result as model.
x <- udpipe(
  x = "Ich bin auf eine Reise gegangen und habe mitgenommen: meinen Laptop, meine Sonnenbrille und gute Laune.",
  object = udmodel
)
x

# Keep a plain data.frame copy of the annotation.
y <- as.data.frame(x)
y

# Load the German model from its file in the working directory and annotate a
# single sentence with it.
udmodel_deutsch <- udpipe_load_model(
  file = "german-gsd-ud-2.4-190531.udpipe"
)
s <- udpipe_annotate(
  object = udmodel_deutsch,
  x = "Ich bin auf eine Reise gegangen und habe mitgenommen: meinen Laptop, meine Sonnenbrille und gute Laune."
)

# Show the annotation object as returned by udpipe_annotate().
s

# Two example documents, annotated with the default settings and converted
# straight to a data.frame.
txt <- c(
  "Ich habe mich verfahren, können Sie mir sagen, wo sich der Dicke Whopper befindet? Ja, mein Herr",
  "Es schreitet voran, es schreitet erstaunlich gut voran"
)
x <- as.data.frame(udpipe_annotate(udmodel_deutsch, x = txt))

# Structure of the annotation table and frequency of the universal POS tags.
str(x)
table(x[["upos"]])

## Variant 1: tokenization and sentence detection only
## (no POS tagging, no lemmatization, no dependency parsing)
x <- as.data.frame(
  udpipe_annotate(udmodel_deutsch, x = txt, tagger = "none", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 2: tokenization, sentence detection, POS tagging and lemmatization
## (no dependency parsing)
x <- as.data.frame(
  udpipe_annotate(udmodel_deutsch, x = txt, tagger = "default", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 3: tokenization, sentence detection and dependency parsing
## (no POS tagging, no lemmatization)
x <- as.data.frame(
  udpipe_annotate(udmodel_deutsch, x = txt, tagger = "none", parser = "default")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Either put every token on a new line and use tokenizer: vertical
## The token lists are the pre-tokenised form of the two example sentences
## used earlier in this script: one element per token, punctuation as its own
## token, and no padding whitespace around any token. "Dicke Whopper" is kept
## as a single token that contains a space.
input <- list(
  doc1 = c("Ich", "habe", "mich", "verfahren", ",", "können", "Sie", "mir",
           "sagen", ",", "wo", "sich", "der", "Dicke Whopper", "befindet",
           "?", "Ja", ",", "mein", "Herr"),
  doc2 = c("Es", "schreitet", "voran", ",", "es", "schreitet", "erstaunlich",
           "gut", "voran")
)

# One string per document with the tokens separated by newlines.
# vapply() guarantees a character vector with one element per document.
txt <- vapply(
  input,
  FUN = function(tokens) paste(tokens, collapse = "\n"),
  FUN.VALUE = character(1)
)
## Annotate the one-token-per-line documents with the vertical tokenizer
x <- udpipe_annotate(udmodel_deutsch, x = txt, tokenizer = "vertical")
x <- as.data.frame(x)

## Or put every token of each document in 1 string separated by a space and use tokenizer: horizontal
##   Mark that if a token contains a space, you need to replace the space
##   with the 'NO-BREAK SPACE' (U+00A0) character to make sure it is still considered as one token
txt <- vapply(
  input,
  FUN = function(tokens) {
    # replace space with no-break-space so multi-word tokens stay in one piece
    tokens <- gsub(" ", intToUtf8(160), tokens)
    paste(tokens, collapse = " ")
  },
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(udmodel_deutsch, x = as.character(txt), tokenizer = "horizontal")

x <- as.data.frame(x)

# Show the resulting annotation table.
x

## Some remarks

# Your text has to be in UTF-8 Encoding when you pass it to udpipe_annotate,
# if you don't have that Encoding use standard R facilities like iconv to
# convert it to UTF-8. You get also results in UTF-8 encoding back

# Fetch the Sanskrit model from the UD 2.0 repository. `overwrite = FALSE`
# avoids downloading it again when the file is already present on disk.
dl <- udpipe_download_model(
  language = "sanskrit",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.0",
  overwrite = FALSE
)
udmodel_sanskrit <- udpipe_load_model(file = dl$file_model)
txt <- "ततः असौ प्राह क्षत्रियस्य तिस्रः भार्या धर्मम् भवन्ति तत् एषा कदाचिद् वैश्या सुता भविष्यति तत् अनुरागः ममास्याम् ततः रथकारः तस्य निश्चयम् विज्ञायावदत् वयस्य किम् अ धुना कर्तव्यम् कौलिकः आह किम् अहम् जानामि त्वयि मित्रे यत् अभिहितं मया ततः"
x <- udpipe_annotate(udmodel_sanskrit, x = txt)

# Show the declared encoding of the returned CoNLL-U string.
Encoding(x$conllu)
x <- as.data.frame(x)

Niederländisches Skript

library(udpipe)

# Download the Dutch model on the first run only: `overwrite = FALSE` skips
# the download when the model file is already present in the working directory.
# Assigning `udmodel` here matters: otherwise it is undefined in a fresh
# session, or still refers to whatever model was assigned to it earlier.
# The model repository is pinned to UD 2.4 so that the downloaded file is
# expected to be "dutch-alpino-ud-2.4-190531.udpipe", the file name that is
# loaded further down in this script.
udmodel <- udpipe_download_model(
  language = "dutch",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.4",
  overwrite = FALSE
)

# Inspect the download result.
udmodel

# Annotate one sentence in a single call, passing the download result as model.
x <- udpipe(
  x = "Ik ging op reis en ik nam mee: mijn laptop, mijn zonnebril en goed humeur.",
  object = udmodel
)
x

# Load the Dutch model from its file in the working directory.
udmodel_dutch <- udpipe_load_model(
  file = "dutch-alpino-ud-2.4-190531.udpipe"
)

# Two example documents, annotated with the default settings and converted
# straight to a data.frame.
txt <- c(
  "Ik ben de weg kwijt, kunt u me zeggen waar de Lange Wapper ligt? Jazeker meneer",
  "Het gaat vooruit, het gaat verbazend goed vooruit"
)
x <- as.data.frame(udpipe_annotate(udmodel_dutch, x = txt))

# Structure of the annotation table and frequency of the universal POS tags.
str(x)
table(x[["upos"]])

## Variant 1: tokenization and sentence detection only
## (no POS tagging, no lemmatization, no dependency parsing)
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 2: tokenization, sentence detection, POS tagging and lemmatization
## (no dependency parsing)
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "default", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 3: tokenization, sentence detection and dependency parsing
## (no POS tagging, no lemmatization)
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "default")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Either put every token on a new line and use tokenizer: vertical
## One element per token; "Lange Wapper" is a single token containing a space.
input <- list(
  doc1 = c("Ik", "ben", "de", "weg", "kwijt", ",", "kunt", "u", "me", "zeggen",
           "waar", "de", "Lange Wapper", "ligt", "?", "Jazeker", "meneer"),
  doc2 = c("Het", "gaat", "vooruit", ",", "het", "gaat", "verbazend", "goed",
           "vooruit")
)

# One string per document with the tokens separated by newlines.
# vapply() guarantees a character vector with one element per document.
txt <- vapply(
  input,
  FUN = function(tokens) paste(tokens, collapse = "\n"),
  FUN.VALUE = character(1)
)
## Annotate the one-token-per-line documents with the vertical tokenizer
x <- udpipe_annotate(udmodel_dutch, x = txt, tokenizer = "vertical")
x <- as.data.frame(x)

## Or put every token of each document in 1 string separated by a space and use tokenizer: horizontal
##   Mark that if a token contains a space, you need to replace the space
##   with the 'NO-BREAK SPACE' (U+00A0) character to make sure it is still considered as one token
txt <- vapply(
  input,
  FUN = function(tokens) {
    # replace space with no-break-space so multi-word tokens stay in one piece
    tokens <- gsub(" ", intToUtf8(160), tokens)
    paste(tokens, collapse = " ")
  },
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(udmodel_dutch, x = as.character(txt), tokenizer = "horizontal")
x <- as.data.frame(x)

## Some remarks

# Your text has to be in UTF-8 Encoding when you pass it to udpipe_annotate,
# if you don't have that Encoding use standard R facilities like iconv to
# convert it to UTF-8. You get also results in UTF-8 encoding back

# Fetch the Sanskrit model from the UD 2.0 repository. `overwrite = FALSE`
# avoids downloading it again when the file is already present on disk.
dl <- udpipe_download_model(
  language = "sanskrit",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.0",
  overwrite = FALSE
)
udmodel_sanskrit <- udpipe_load_model(file = dl$file_model)
txt <- "ततः असौ प्राह क्षत्रियस्य तिस्रः भार्या धर्मम् भवन्ति तत् एषा कदाचिद् वैश्या सुता भविष्यति तत् अनुरागः ममास्याम् ततः रथकारः तस्य निश्चयम् विज्ञायावदत् वयस्य किम् अ धुना कर्तव्यम् कौलिकः आह किम् अहम् जानामि त्वयि मित्रे यत् अभिहितं मया ततः"
x <- udpipe_annotate(udmodel_sanskrit, x = txt)

# Show the declared encoding of the returned CoNLL-U string.
Encoding(x$conllu)
x <- as.data.frame(x)

Schlussbemerkung:

Beide Skripte funktionieren; ich habe sie gerade eben noch einmal durchgetestet (Stand: 08.12.2019).
Erläuterungen sind, wie gehabt, im Originaltutorial zu finden, das sich hier befindet:
UDPipe Introduction · NLP with R and UDPipe

Dieses Blog durchsuchen

Es gibt ...