Mein holpriger Weg zur automatischen Textkategorisierung mit Perl-Modulen / Algorithmen - Teil II
Wow, ich hab' mal auf Anhieb was Schwieriges richtig verstanden!
Aus Näggl mit Köppn - Probelauf 1 (Hembelz Om(x))
Code
#!/usr/bin/perl
# tsquery_tsrank_2_tfidf.pl
use strict;
use warnings;
use DBI;
use ZugangsDaten_postgresql qw($DB_USER $DB_PASSWD);
use Encode qw(decode encode);
use Storable qw(store retrieve);
use Lingua::TFIDF;
use Lingua::TFIDF::WordSegmenter::SplitBySpace;
use feature qw(say);
# Variablen
our $dbh;
# Programm
print "\nHier meldet sich das Programm \"tsquery_tsrank_2_text.pl\": Huhu!\n\n";
### Connect
connect_db();
## Update
wordvector_update();
# Query-Eingabe
print "\nBitte die Query eingeben! (Muster: (overload | meltdown) & (autist | autismus | asperger) ) \n\n";
my $tsquery = <STDIN>;
chomp $tsquery;
# Limit-Eingabe
print "\nBitte das Limit (Anzahl der gerankten Funde) eingeben!\n\n";
my $limit= <STDIN>;
chomp $limit;
$limit = int($limit);
## Testabfrage
print "\nSoll ohne Google-Search-Links gesucht/geranked werden? (ja/n)\n\n";
my $googlesearch = <STDIN>;
chomp $googlesearch;
my ($ranks_ref, $links_ref, $wordvectors_ref, $texts_ref);
if
( $googlesearch eq "ja" ) { ($ranks_ref, $links_ref, $wordvectors_ref,
$texts_ref) = tsquery_nogooglesearch($tsquery,$limit) }
else { ($ranks_ref, $links_ref, $wordvectors_ref, $texts_ref) = tsquery($tsquery,$limit) }
### Disconnect
disconnect_db();
## Testabfrage-Fortsetzung
my %ranks = %$ranks_ref;
my %links = %$links_ref;
my %wordvectors = %$wordvectors_ref;
my %texts = %$texts_ref;
print "\nHier sind die Ergebnisse!\n\n";
print map { $_, "\n", $ranks{$_}, "\n", $links{$_}, "\n" } sort { $links{$b} cmp $links{$a} } keys %ranks;
print "\nFertig!\n\n";
## Token nach link_ids auslesen
my @documents;
connect_db();
foreach ( keys %links ) {
my $document = document_token_select($_);
push @documents, $document;
}
disconnect_db();
## TF-IDF berechnen
my $tf_idf_calc = Lingua::TFIDF->new(
# Use a word segmenter for japanese text.
word_segmenter => Lingua::TFIDF::WordSegmenter::SplitBySpace->new,
);
my $tf_idfs = $tf_idf_calc->tf_idf(documents => \@documents);
## TF-IDF ausgeben
for ( my $i = 0; $i < scalar @documents; $i++ ) {
# TF-IDF of word $_ in $_
foreach ( keys $tf_idfs->[$i] ) {
say "Say $_, doc$i: ", $tf_idfs->[$i]{$_}, "\n";
}
}
###########################################################
############### Subroutinen ####################
###########################################################
# Subroutinen
sub connect_db {
## Verbindung zur DB herstellen
$dbh = DBI->connect("DBI:Pg:dbname=links;host=localhost", "$DB_USER", "$DB_PASSWD");
}
sub disconnect_db {
$dbh->disconnect();
}
# tsqueries
sub tsquery {
print "\n\ntsquery läuft!\n\n";
my $tsquery = shift;
my $limit = shift;
my $ranks_query = $dbh->prepare(" SELECT link_id, wordvector, ts_rank_cd(wordvector, query) AS rank
FROM wordvectors, to_tsquery(?) query
WHERE query @@ wordvector
ORDER BY rank DESC
LIMIT ?;");
$ranks_query->execute($tsquery,$limit);
my $texts_query = $dbh->prepare("SELECT link_id, text FROM texts WHERE link_id = ?;");
my %ranks;
my %wordvectors;
my %texts;
while ( my @query_row = $ranks_query->fetchrow_array ) {
my $link_id = $query_row[0];
my $wordvector = $query_row[1];
my $rank = $query_row[2];
$texts_query->execute($link_id);
my $text = $texts_query->fetchrow;
print "Link: $link_id Rank: $rank\n";
$ranks{$link_id} = $rank;
$wordvectors{$link_id} = $wordvector;
$texts{$link_id} = $text;
}
my $links_query = $dbh->prepare(" SELECT link_id, link_name
FROM links
WHERE link_id = ?;");
my %links;
foreach ( keys %ranks ) {
$links_query->execute($_);
while ( my @query_row = $links_query->fetchrow_array ) {
my $link_id = $query_row[0];
my $link = $query_row[1];
$link = encode('UTF8', $link, Encode::FB_DEFAULT);
print "Link: $link_id\nURL: $link\n";
$links{$link_id} = $link;
}
}
return (\%ranks, \%links, \%wordvectors, \%texts)
}
# tsqueries_nogooglesearch
sub tsquery_nogooglesearch {
print "\n\ntsquery_nogooglesearch läuft!\n\n";
my $tsquery = shift;
my $limit = shift;
my $ranks_query = $dbh->prepare(" SELECT wordvectors.link_id,
links.link_name, wordvector, ts_rank_cd(wordvector, query) AS rank
FROM wordvectors, to_tsquery(?) query, links
WHERE query @@ wordvector
AND wordvectors.link_id = links.link_id
AND links.link_name NOT LIKE '%google.com/search%'
AND links.link_name NOT LIKE '%webcache.googleusercontent.com/search%'
ORDER BY rank DESC
LIMIT ?;");
$ranks_query->execute($tsquery,$limit);
my $texts_query = $dbh->prepare("SELECT link_id, text FROM texts WHERE link_id = ?;");
my %ranks;
my %wordvectors;
my %texts;
while ( my @query_row = $ranks_query->fetchrow_array ) {
my $link_id = $query_row[0];
my $wordvector = $query_row[1];
my $rank = $query_row[2];
$texts_query->execute($link_id);
my $text = $texts_query->fetchrow;
print "Link: $link_id Rank: $rank\n";
$ranks{$link_id} = $rank;
$wordvectors{$link_id} = $wordvector;
$texts{$link_id} = $text;
}
my $links_query = $dbh->prepare(" SELECT link_id, link_name
FROM links
WHERE link_id = ?;");
my %links;
foreach ( keys %ranks ) {
$links_query->execute($_);
while ( my @query_row = $links_query->fetchrow_array ) {
my $link_id = $query_row[0];
my $link = $query_row[1];
$link = encode('UTF8', $link, Encode::FB_DEFAULT);
print "Link: $link_id\nURL: $link\n";
$links{$link_id} = $link;
}
}
return (\%ranks, \%links, \%wordvectors, \%texts)
}
# wordvector-table-Update
sub wordvector_update {
my $links2wordvectors_upsert = $dbh->prepare("INSERT INTO
links2wordvectors (link_id) SELECT link_id FROM links ORDER BY link_id
ON CONFLICT (link_id) DO NOTHING;");
$links2wordvectors_upsert->execute();
my $link2do_query = $dbh->prepare("SELECT link_id, text FROM texts
WHERE link_id IN (SELECT link_id FROM links2wordvectors WHERE
wordvector_created = 0);");
$link2do_query->execute();
my $link2wordvector_upsert = $dbh->prepare("INSERT INTO wordvectors
(link_id, text) VALUES (?,?) ON CONFLICT (link_id) DO NOTHING;");
my $wordvectors_update = $dbh->prepare("UPDATE wordvectors SET wordvector = to_tsvector(text) WHERE link_id = ?;");
my $links2wordvectors_update = $dbh->prepare("UPDATE
links2wordvectors SET wordvector_created = ?, fdate = ? WHERE link_id =
?;");
while ( my ($linkid, $text) = $link2do_query->fetchrow() ) {
print "\n\nLink-ID: $linkid\n\n";
$link2wordvector_upsert->execute($linkid,$text);
$wordvectors_update->execute($linkid);
$links2wordvectors_update->execute('1',date(),$linkid);
}
}
# Subroutine Date
sub date {
my $year = (localtime((time)))[5] + 1900;
my $month = (localtime((time)))[4] + 1;
my $mday = (localtime((time)))[3];
my $date = $year.'-'.$month.'-'.$mday;
return $date;
}
# Dokument-Token auslesen
sub document_token_select {
my $link_id = shift;
my $document_token_select = $dbh->prepare("SELECT token FROM (SELECT
token(ts_debug(text)) FROM texts WHERE link_id = $link_id) AS token;");
$document_token_select->execute();
my @document_token;
while ( my $token = $document_token_select->fetchrow() ) {
if ( $token =~ /[a-zA-ZäöüÄÖÜß]+/ ) {
push @document_token, $token;
}
}
my $document_token_string = join ( " ", map { $_ } @document_token );
return $document_token_string
}
Output
...
keys on reference is experimental at tsquery_tsrank_2_tfidf.pl line 99.
...
Say braucht, doc10: 1.01160091167848
Say Stiefel, doc10: 2.39789527279837
Say irgendwelche, doc10: 0.451985123743057
Say Hey, doc10: 0.78845736036427
Say Klinik, doc10: 1.57691472072854
Say wiegt, doc10: 0.451985123743057
Say Umzug, doc10: 1.01160091167848
Say somit, doc10: 1.01160091167848
Say Nach, doc10: 0.606135803570315
Say paralysiert, doc10: 2.39789527279837
Say Frust, doc10: 1.70474809223843
Say bevorzugt, doc10: 1.70474809223843
Say gesagt, doc10: 3.1638958662014
Say nachdenken, doc10: 1.70474809223843
Say ALG, doc10: 4.79579054559674
Say aufgaben, doc10: 2.39789527279837
Say offen, doc10: 1.01160091167848
Say ausgepr�gter, doc10: 1.70474809223843
Say lange, doc10: 0.955361193355604
Say anstarren/stalken, doc10: 2.39789527279837
Say hasst, doc10: 1.70474809223843
Say kiga, doc10: 19.183162182387
Say herausfinden, doc10: 1.29928298413026
Say Dank, doc10: 0.78845736036427
Say eifers�chtig, doc10: 1.70474809223843
Say Muss, doc10: 2.59856596826052
Say Winter, doc10: 3.40949618447685
...
Da stört nur noch die Ausgabe ( Say braucht keinen Zeilenumbruch, Alter!! ) und vor Allem das UTF8-Prob. Aber für heute habe ich genug geleistet, kümmer' ich mich morgen drum.
...
Hab's doch noch gemacht:
Code
# Dokument-Token auslesen
sub document_token_select {
my $link_id = shift;
my $document_token_select = $dbh->prepare("SELECT token FROM (SELECT
token(ts_debug(text)) FROM texts WHERE link_id = $link_id) AS token;");
$document_token_select->execute();
my @document_token;
while ( my $token = $document_token_select->fetchrow() ) {
if ( $token =~ /[a-zA-ZäöüÄÖÜß]+/ ) {
push @document_token, $token;
}
}
my $document_token_string = join ( " ", map { $_ } @document_token );
eval{$document_token_string = encode('utf8', $document_token_string, Encode::FB_CROAK)};
return $document_token_string
}
Output
...
Say Erziehungsberechtigen, doc10: 1.70474809223843
Say überprüfen, doc10: 2.59856596826052
Say untersuchen, doc10: 0.78845736036427
Say Wirkstoff, doc10: 1.70474809223843
Say Eindruck, doc10: 1.01160091167848
Say vermuten, doc10: 2.59856596826052
Say Persönlichkeiten, doc10: 2.02320182335696
Say Anschaulichkeit, doc10: 1.70474809223843
Say Ungeschicktheit, doc10: 1.70474809223843
Say distanziert, doc10: 1.01160091167848
Say Home, doc10: 1.70474809223843
Say Bereitschaft, doc10: 1.70474809223843
Say gehen, doc10: 0.19062035960865
Say einzelne, doc10: 3.03480273503544
Say wenn, doc10: 0.57186107882595
Say Kommunikationsdefizite, doc10: 1.70474809223843
Say Sprachgebrauch, doc10: 1.70474809223843
Say handelt, doc10: 2.02320182335696
Say stehen, doc10: 0.451985123743057
Say sozial, doc10: 1.81840741071095
Say vorliest, doc10: 3.40949618447685
Say er, doc10: 0.285930539412975
Say Schönheit, doc10: 1.70474809223843
Say Asperger-Syndroms, doc10: 4.84908642856252
Say sollen, doc10: 0.606135803570315
Say also, doc10: 0.19062035960865
Say schließen, doc10: 3.03480273503544
Say konfrontiert, doc10: 1.70474809223843
Say gesicherten, doc10: 1.70474809223843
Say Sucht, doc10: 1.01160091167848
Say laufen, doc10: 0.78845736036427
Say Situationen, doc10: 3.1638958662014
Say Online-Informationen, doc10: 3.40949618447685
Say würde, doc10: 0.0953101798043249
Say aller, doc10: 0.78845736036427
Say Jörgensen, doc10: 1.70474809223843
Say Verhaltensmuster, doc10: 2.36537208109281
Say diesem, doc10: 0.200670695462151
Say milder, doc10: 1.70474809223843
Say Geschäftsmodell, doc10: 1.70474809223843
Say starre, doc10: 1.01160091167848
Say neben, doc10: 0.955361193355604
Say Aktuell, doc10: 1.70474809223843
Say erwidern, doc10: 1.70474809223843
Say anstrengend, doc10: 1.29928298413026
Say anwenden, doc10: 1.01160091167848
Say Autismus-Spektrum-Quotient, doc10: 1.70474809223843
Say ich, doc10: 0.0953101798043249
Say Ärztin, doc10: 6.8189923689537
Say mathematischen, doc10: 1.70474809223843
Say Empathiefähigkeit, doc10: 3.40949618447685
Say hinter, doc10: 3.94228680182135
Say Mathematik, doc10: 0.451985123743057
Say Intelligenz, doc10: 1.80794049497223
Say Lebensjahr, doc10: 1.91072238671121
Say Stand, doc10: 5.11424427671528
Say akkreditiert, doc10: 3.40949618447685
Say gemeint, doc10: 0.606135803570315
Say Entwicklungsstörung, doc10: 5.19713193652104
...
Und so verschwindet die Warnmeldung
keys on reference is experimental at tsquery_tsrank_2_tfidf.pl line 99.
:
## TF-IDF ausgeben
for ( my $i = 0; $i < scalar @documents; $i++ ) {
# TF-IDF of word $_ in $_
my $tfidf_ref = $tf_idfs->[$i];
my %tfidf = %$tfidf_ref;
foreach ( keys %tfidf ) {
say "Say $_, doc$i: ", $tf_idfs->[$i]{$_};
}
}
Kommentare
Kommentar veröffentlichen