Heutiger Testraketenstart

{ nicht mehr ganz so heutig, mehr ein heutig+7 ;-) }

Nachbauen des folgenden Kurses in RStudio auf einem Windows 10 Laptop:
Introducing udpipe for easy Natural Language Processing in R | DataScience+

Skript für deutsche Textanalyse

# Switch every locale category to German.
Sys.setlocale(category = "LC_ALL", locale = "German")

# Install the packages on demand:
# install.packages("udpipe")
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("stringr")
# install.packages("lattice")

# Data set from Leipzig University, reshaped into a usable CSV format with
# Geany. Overview of German corpora:
# https://www.linguistik.hu-berlin.de/de/institut/professuren/korpuslinguistik/links/korpora_links

# Packages for the basic summary
library(dplyr)
library(ggplot2)

# Read the tab-separated corpus with quoting disabled, then count the rows
# per initial character and list the most frequent ones first.
news <- read.csv(
  "C:/Users/Thorsten/Documents/udpipe-nlp/deu_news_2015_1M-sentences.csv",
  sep = "\t",
  quote = ""
)
news %>%
  group_by(anfangszeichen) %>%
  count() %>%
  arrange(desc(n))

# Attach udpipe and load the pre-trained German model from the working
# directory.
library(udpipe)
# On the very first run the model has to be downloaded as well:
# model <- udpipe_download_model(language = "german")
udmodel_german <- udpipe_load_model(
  file = "german-gsd-ud-2.4-190531.udpipe"
)

# The corpus is large, so only a subset is annotated: keep the rows whose
# initial character is "G".
# NOTE(review): the previous comment named "A" while the code selects "G";
# the code is left unchanged - confirm which letter was intended.
news_azx <- news %>%
  filter(anfangszeichen == "G")

# Annotate every text of the subset with the German model and convert the
# annotation into a data frame.
# as.character() guards against the text column having been read as a factor
# (the read.csv default before R 4.0); it is a no-op for character vectors.
s <- udpipe_annotate(udmodel_german, as.character(news_azx$nachrichtentext))
x <- as.data.frame(s)

# Attach lattice, tabulate the UPOS tags and show their frequencies.
library(lattice)
stats <- txt_freq(x$upos)
# Reverse the factor levels so the first row of `stats` ends up at the top
# of the horizontal bar chart.
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = stats,
  col = "yellow",
  main = "UPOS (Universal Parts of Speech)\n frequency of occurrence",
  xlab = "Freq"
)

# Most frequent tokens per part of speech (first 20 rows of each table)

## Nouns
stats <- txt_freq(x$token[x$upos %in% "NOUN"])
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "cadetblue",
  main = "Most occurring nouns",
  xlab = "Freq"
)

## Adjectives
stats <- txt_freq(x$token[x$upos %in% "ADJ"])
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "purple",
  main = "Most occurring adjectives",
  xlab = "Freq"
)

## Verbs
stats <- txt_freq(x$token[x$upos %in% "VERB"])
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "gold",
  main = "Most occurring Verbs",
  xlab = "Freq"
)

# Automatic keyword extraction with RAKE

## RAKE on the lemmas, grouped by document; only nouns and adjectives are
## flagged as relevant.
stats <- keywords_rake(
  x = x,
  term = "lemma",
  group = "doc_id",
  relevant = x$upos %in% c("NOUN", "ADJ")
)
stats[["key"]] <- factor(stats[["keyword"]], levels = rev(stats[["keyword"]]))
# Keep keywords with a frequency above 3 and plot the first 20 remaining
# rows by their RAKE score.
barchart(
  key ~ rake,
  data = head(stats[which(stats$freq > 3), , drop = FALSE], n = 20),
  col = "red",
  main = "Keywords identified by RAKE",
  xlab = "Rake"
)

# Keyword phrases from a sequence of POS tags. The original heading calls
# these "top noun-verb pairs"; the pattern and the plot title refer to
# simple noun phrases.

## Recode the UPOS tags for phrase matching, then extract every phrase of
## lower-cased tokens whose tag sequence matches the regular expression.
x$phrase_tag <- as_phrasemachine(x$upos, type = "upos")
stats <- keywords_phrases(
  x = x$phrase_tag,
  term = tolower(x$token),
  pattern = "(A|N)*N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  detailed = FALSE
)
# Only multi-word phrases that occur more than three times
stats <- stats[which(stats$ngram > 1 & stats$freq > 3), , drop = FALSE]
stats[["key"]] <- factor(stats[["keyword"]], levels = rev(stats[["keyword"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "magenta",
  main = "Keywords - simple noun phrases",
  xlab = "Frequency"
)

Output (letzte Grafik)

Skript für englische Textanalyse

# Set every locale category from the empty locale string (""), i.e. let R
# pick the locale from the environment instead of forcing a specific one.
Sys.setlocale(category = "LC_ALL", locale = "")

# Install the packages on demand:
# install.packages("udpipe")
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("stringr")
# install.packages("lattice")

# Data set on Kaggle: https://www.kaggle.com/therohk/million-headlines

# Packages for the basic summary
library(dplyr)
library(ggplot2)

# Read the headlines, keeping text columns as character vectors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
news <- read.csv(
  "C:/Users/Thorsten/Documents/udpipe-nlp/abcnews-date-text.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Number of headlines per publication date, most frequent first
news %>%
  group_by(publish_date) %>%
  count() %>%
  arrange(desc(n))

# Split the publication date into year, month and day and store them as
# three new columns (year, month, date) in a copy of the data.
# Assumes publish_date is laid out as YYYYMMDD - the substring positions
# below depend on it.
library(stringr)
news_more <- news %>%
  mutate(
    year = str_sub(publish_date, 1, 4),
    month = str_sub(publish_date, 5, 6),
    date = str_sub(publish_date, 7, 8)
  )

# Number of headlines per year as a bar chart
news_more %>%
  group_by(year) %>%
  count() %>%
  ggplot() +
  geom_bar(aes(year, n), stat = "identity")

# Attach udpipe and load the pre-trained English model.
library(udpipe)

# Download the model only when the expected file is not present yet; the
# download used to run again on every execution of the script.
udmodel_file <- "english-ewt-ud-2.4-190531.udpipe"
if (!file.exists(udmodel_file)) {
  model <- udpipe_download_model(language = "english")
  if (isTRUE(model$download_failed)) {
    stop("Download of the English udpipe model failed", call. = FALSE)
  }
  # Load the file that was actually downloaded: its name is reported by
  # udpipe_download_model() and may differ from the hard-coded one.
  udmodel_file <- model$file_model
}
udmodel_english <- udpipe_load_model(file = udmodel_file)

# The data set is large, so only a subset is annotated: keep the headlines
# from November 2007.
# year and month were cut out of publish_date with str_sub() and are text,
# so they are compared against strings. A numeric literal only matches via
# implicit coercion and would silently fail for single-digit months
# ("05" is never equal to 5).
news_more_2007 <- news_more %>%
  filter(year == "2007", month == "11")

# Annotate every headline of the subset with the English model and convert
# the annotation into a data frame.
s <- udpipe_annotate(udmodel_english, news_more_2007$headline_text)
x <- as.data.frame(s)

# Attach lattice, tabulate the UPOS tags and show their frequencies.
library(lattice)
stats <- txt_freq(x$upos)
# Reverse the factor levels so the first row of `stats` ends up at the top
# of the horizontal bar chart.
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = stats,
  col = "yellow",
  main = "UPOS (Universal Parts of Speech)\n frequency of occurrence",
  xlab = "Freq"
)

# Most frequent tokens per part of speech (first 20 rows of each table)

## Nouns
stats <- txt_freq(x$token[x$upos %in% "NOUN"])
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "cadetblue",
  main = "Most occurring nouns",
  xlab = "Freq"
)

## Adjectives
stats <- txt_freq(x$token[x$upos %in% "ADJ"])
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "purple",
  main = "Most occurring adjectives",
  xlab = "Freq"
)

## Verbs
stats <- txt_freq(x$token[x$upos %in% "VERB"])
stats[["key"]] <- factor(stats[["key"]], levels = rev(stats[["key"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "gold",
  main = "Most occurring Verbs",
  xlab = "Freq"
)

# Automatic keyword extraction with RAKE

## RAKE on the lemmas, grouped by document; only nouns and adjectives are
## flagged as relevant.
stats <- keywords_rake(
  x = x,
  term = "lemma",
  group = "doc_id",
  relevant = x$upos %in% c("NOUN", "ADJ")
)
stats[["key"]] <- factor(stats[["keyword"]], levels = rev(stats[["keyword"]]))
# Keep keywords with a frequency above 3 and plot the first 20 remaining
# rows by their RAKE score.
barchart(
  key ~ rake,
  data = head(stats[which(stats$freq > 3), , drop = FALSE], n = 20),
  col = "red",
  main = "Keywords identified by RAKE",
  xlab = "Rake"
)

# Keyword phrases from a sequence of POS tags. The original heading calls
# these "top noun-verb pairs"; the pattern and the plot title refer to
# simple noun phrases.

## Recode the UPOS tags for phrase matching, then extract every phrase of
## lower-cased tokens whose tag sequence matches the regular expression.
x$phrase_tag <- as_phrasemachine(x$upos, type = "upos")
stats <- keywords_phrases(
  x = x$phrase_tag,
  term = tolower(x$token),
  pattern = "(A|N)*N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  detailed = FALSE
)
# Only multi-word phrases that occur more than three times
stats <- stats[which(stats$ngram > 1 & stats$freq > 3), , drop = FALSE]
stats[["key"]] <- factor(stats[["keyword"]], levels = rev(stats[["keyword"]]))
barchart(
  key ~ freq,
  data = head(stats, n = 20),
  col = "magenta",
  main = "Keywords - simple noun phrases",
  xlab = "Frequency"
)

Output (letzte Grafik)

Schlussbemerkung zur Stufe 2a

Beide Skripte sind Hempel-OMX-oXo-geprüft - sie funzen ohne Einschränkung (Stand 08.12.2019).
Was in den einzelnen Code-Zeilen passiert, lässt sich alles hier nachlesen: https://datascienceplus.com/introducing-udpipe-for-easy-natural-language-processing-in-r?utm_campaign=News&utm_medium=Community&utm_source=DataCamp.com

Für meine Zwecke ist es hochwichtig, dass alles auch mit deutschen Worten und Umlauten korrekt und möglichst gleichwertig wie im Englischen funktioniert. Zurzeit ist alles noch überwiegend Eruierung. Zu anspruchsvolleren und passenderen Anwendungen werden wir mit der Zeit - und mit zunehmendem Überblick - dann noch kommen - versprochen!

Dieses Blog durchsuchen

Es gibt ...