Heutiger Testraketenstart - Stufe 2c
Tutorial:
UDPipe Introduction · NLP with R and UDPipe
Deutsches Skript
library(udpipe)

# Fetch the German model on first use. `overwrite = FALSE` skips the download
# when the model file is already present, and the repository is pinned to
# UD 2.4 so the downloaded file name matches the
# "german-gsd-ud-2.4-190531.udpipe" path that is loaded later in this script.
udmodel <- udpipe_download_model(
  language = "german",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.4",
  overwrite = FALSE
)

# Fail early with a readable message instead of a confusing error further down.
if (isTRUE(udmodel$download_failed)) {
  stop(
    "Could not download the German udpipe model: ",
    udmodel$download_message,
    call. = FALSE
  )
}
udmodel

# One-step annotation: udpipe() takes the download result as its model.
x <- udpipe(
  x = "Ich bin auf eine Reise gegangen und habe mitgenommen: meinen Laptop, meine Sonnenbrille und gute Laune.",
  object = udmodel
)
x

# Keep a plain data.frame copy of the annotation and show it.
y <- as.data.frame(x)
y
# Load the German model from the file in the working directory.
udmodel_deutsch <- udpipe_load_model(
  file = "german-gsd-ud-2.4-190531.udpipe"
)

# Annotate a single sentence and print the raw annotation object.
s <- udpipe_annotate(
  udmodel_deutsch,
  x = "Ich bin auf eine Reise gegangen und habe mitgenommen: meinen Laptop, meine Sonnenbrille und gute Laune."
)
s

# Two example documents; annotate them and convert straight to a data.frame.
txt <- c(
  "Ich habe mich verfahren, können Sie mir sagen, wo sich der Dicke Whopper befindet? Ja, mein Herr",
  "Es schreitet voran, es schreitet erstaunlich gut voran"
)
x <- as.data.frame(udpipe_annotate(udmodel_deutsch, x = txt))

# Inspect the structure and count the universal part-of-speech tags.
str(x)
table(x[["upos"]])
## Variant 1: tokenization and sentence splitting only
## (no POS tagging, no lemmatization, no dependency parsing)
x <- as.data.frame(
  udpipe_annotate(udmodel_deutsch, x = txt, tagger = "none", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 2: tokenization, sentence splitting, POS tagging and lemmatization,
## but no dependency parsing
x <- as.data.frame(
  udpipe_annotate(udmodel_deutsch, x = txt, tagger = "default", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 3: tokenization, sentence splitting and dependency parsing,
## but no POS tagging and no lemmatization
x <- as.data.frame(
  udpipe_annotate(udmodel_deutsch, x = txt, tagger = "none", parser = "default")
)
table(x[["upos"]])
table(x[["dep_rel"]])
## Pre-tokenized input, option 1: put every token on a new line and use
## tokenizer = "vertical".
## The token lists mirror the two German example sentences used earlier in this
## script; tokens carry no surrounding whitespace, and the only token with an
## inner space is the multi-word name "Dicke Whopper".
input <- list(
  doc1 = c(
    "Ich", "habe", "mich", "verfahren", ",", "können", "Sie", "mir", "sagen",
    ",", "wo", "sich", "der", "Dicke Whopper", "befindet", "?", "Ja", ",",
    "mein", "Herr"
  ),
  doc2 = c(
    "Es", "schreitet", "voran", ",", "es", "schreitet", "erstaunlich", "gut",
    "voran"
  )
)

# vapply() guarantees exactly one string per document (type-stable, unlike
# sapply()).
txt <- vapply(
  input,
  FUN = function(tokens) paste(tokens, collapse = "\n"),
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(udmodel_deutsch, x = txt, tokenizer = "vertical")
x <- as.data.frame(x)

## Pre-tokenized input, option 2: put every token of each document in one
## string separated by a space and use tokenizer = "horizontal".
## If a token itself contains a space, replace that space with the
## 'NO-BREAK SPACE' (U+00A0) character so it is still treated as one token.
txt <- vapply(
  input,
  FUN = function(tokens) {
    tokens <- gsub(" ", intToUtf8(160), tokens) # space -> no-break space
    paste(tokens, collapse = " ")
  },
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(
  udmodel_deutsch,
  x = as.character(txt),
  tokenizer = "horizontal"
)
x <- as.data.frame(x)
x
## Remarks on encoding
# Text passed to udpipe_annotate has to be UTF-8 encoded. If yours is not,
# convert it first with standard R facilities such as iconv. The results
# also come back in UTF-8.
dl <- udpipe_download_model(
  language = "sanskrit",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.0"
)
udmodel_sanskrit <- udpipe_load_model(file = dl[["file_model"]])

txt <- "ततः असौ प्राह क्षत्रियस्य तिस्रः भार्या धर्मम् भवन्ति तत् एषा कदाचिद् वैश्या सुता भविष्यति तत् अनुरागः ममास्याम् ततः रथकारः तस्य निश्चयम् विज्ञायावदत् वयस्य किम् अ धुना कर्तव्यम् कौलिकः आह किम् अहम् जानामि त्वयि मित्रे यत् अभिहितं मया ततः"
x <- udpipe_annotate(udmodel_sanskrit, x = txt)

# Show the declared encoding of the returned CoNLL-U text, then convert.
Encoding(x[["conllu"]])
x <- as.data.frame(x)
Niederländisches Skript
library(udpipe)

# Fetch the Dutch model on first use. `overwrite = FALSE` skips the download
# when the model file is already present, and the repository is pinned to
# UD 2.4 so the downloaded file name matches the
# "dutch-alpino-ud-2.4-190531.udpipe" path that is loaded later in this script.
udmodel <- udpipe_download_model(
  language = "dutch",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.4",
  overwrite = FALSE
)

# Fail early with a readable message instead of a confusing error further down.
if (isTRUE(udmodel$download_failed)) {
  stop(
    "Could not download the Dutch udpipe model: ",
    udmodel$download_message,
    call. = FALSE
  )
}
udmodel

# One-step annotation: udpipe() takes the download result as its model.
x <- udpipe(
  x = "Ik ging op reis en ik nam mee: mijn laptop, mijn zonnebril en goed humeur.",
  object = udmodel
)
x
# Load the Dutch model from the file in the working directory.
udmodel_dutch <- udpipe_load_model(
  file = "dutch-alpino-ud-2.4-190531.udpipe"
)

# Two example documents; annotate them and convert straight to a data.frame.
txt <- c(
  "Ik ben de weg kwijt, kunt u me zeggen waar de Lange Wapper ligt? Jazeker meneer",
  "Het gaat vooruit, het gaat verbazend goed vooruit"
)
x <- as.data.frame(udpipe_annotate(udmodel_dutch, x = txt))

# Inspect the structure and count the universal part-of-speech tags.
str(x)
table(x[["upos"]])
## Variant 1: tokenization and sentence splitting only
## (no POS tagging, no lemmatization, no dependency parsing)
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 2: tokenization, sentence splitting, POS tagging and lemmatization,
## but no dependency parsing
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "default", parser = "none")
)
table(x[["upos"]])
table(x[["dep_rel"]])

## Variant 3: tokenization, sentence splitting and dependency parsing,
## but no POS tagging and no lemmatization
x <- as.data.frame(
  udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "default")
)
table(x[["upos"]])
table(x[["dep_rel"]])
## Pre-tokenized input, option 1: put every token on a new line and use
## tokenizer = "vertical".
input <- list(
  doc1 = c(
    "Ik", "ben", "de", "weg", "kwijt", ",", "kunt", "u", "me", "zeggen",
    "waar", "de", "Lange Wapper", "ligt", "?", "Jazeker", "meneer"
  ),
  doc2 = c(
    "Het", "gaat", "vooruit", ",", "het", "gaat", "verbazend", "goed",
    "vooruit"
  )
)

# vapply() guarantees exactly one string per document (type-stable, unlike
# sapply()).
txt <- vapply(
  input,
  FUN = function(tokens) paste(tokens, collapse = "\n"),
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(udmodel_dutch, x = txt, tokenizer = "vertical")
x <- as.data.frame(x)

## Pre-tokenized input, option 2: put every token of each document in one
## string separated by a space and use tokenizer = "horizontal".
## If a token itself contains a space, replace that space with the
## 'NO-BREAK SPACE' (U+00A0) character so it is still treated as one token.
txt <- vapply(
  input,
  FUN = function(tokens) {
    tokens <- gsub(" ", intToUtf8(160), tokens) # space -> no-break space
    paste(tokens, collapse = " ")
  },
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(
  udmodel_dutch,
  x = as.character(txt),
  tokenizer = "horizontal"
)
x <- as.data.frame(x)
## Remarks on encoding
# Text passed to udpipe_annotate has to be UTF-8 encoded. If yours is not,
# convert it first with standard R facilities such as iconv. The results
# also come back in UTF-8.
dl <- udpipe_download_model(
  language = "sanskrit",
  udpipe_model_repo = "jwijffels/udpipe.models.ud.2.0"
)
udmodel_sanskrit <- udpipe_load_model(file = dl[["file_model"]])

txt <- "ततः असौ प्राह क्षत्रियस्य तिस्रः भार्या धर्मम् भवन्ति तत् एषा कदाचिद् वैश्या सुता भविष्यति तत् अनुरागः ममास्याम् ततः रथकारः तस्य निश्चयम् विज्ञायावदत् वयस्य किम् अ धुना कर्तव्यम् कौलिकः आह किम् अहम् जानामि त्वयि मित्रे यत् अभिहितं मया ततः"
x <- udpipe_annotate(udmodel_sanskrit, x = txt)

# Show the declared encoding of the returned CoNLL-U text, then convert.
Encoding(x[["conllu"]])
x <- as.data.frame(x)
Schlussbemerkung:
Beide Skripte funktionieren; sie wurden gerade eben noch einmal durchgetestet (Stand 08.12.2019). Erläuterungen sind, wie gehabt, im Originaltutorial zu finden, welches sich hier befindet:
UDPipe Introduction · NLP with R and UDPipe
Kommentare
Kommentar veröffentlichen