Mein holpriger Weg zur automatischen Textkategorisierung mit Perl-Modulen / Algorithmen - Teil II


Wow, ich hab' mal auf Anhieb was Schwieriges richtig verstanden!

Aus Näggl mit Köppn - Probelauf 1 (Hembelz Om(x))

 

Code

 

#!/usr/bin/perl

# tsquery_tsrank_2_tfidf.pl

use strict;
use warnings;
use DBI;
use ZugangsDaten_postgresql qw($DB_USER $DB_PASSWD);
use Encode qw(decode encode);
use Storable qw(store retrieve);
use Lingua::TFIDF;
use Lingua::TFIDF::WordSegmenter::SplitBySpace;
use feature qw(say);


# Variablen

our $dbh;


# Programm

print "\nHier meldet sich das Programm \"tsquery_tsrank_2_text.pl\": Huhu!\n\n";

### Connect

connect_db();

## Update

wordvector_update();

# Query-Eingabe

print "\nBitte die Query eingeben! (Muster: (overload | meltdown) & (autist | autismus | asperger) ) \n\n";
my $tsquery = <STDIN>;
chomp $tsquery;

# Limit-Eingabe

print "\nBitte das Limit (Anzahl der gerankten Funde) eingeben!\n\n";
my $limit= <STDIN>;
chomp $limit;
$limit = int($limit);

## Testabfrage

print "\nSoll ohne Google-Search-Links gesucht/geranked werden? (ja/n)\n\n";
my $googlesearch = <STDIN>;
chomp $googlesearch;

my ($ranks_ref, $links_ref, $wordvectors_ref, $texts_ref);

if ( $googlesearch eq "ja" ) { ($ranks_ref, $links_ref, $wordvectors_ref, $texts_ref) = tsquery_nogooglesearch($tsquery,$limit) }
else { ($ranks_ref, $links_ref, $wordvectors_ref, $texts_ref) = tsquery($tsquery,$limit) }

### Disconnect

disconnect_db();

## Testabfrage-Fortsetzung

my %ranks = %$ranks_ref;
my %links = %$links_ref;
my %wordvectors = %$wordvectors_ref;
my %texts = %$texts_ref;

print "\nHier sind die Ergebnisse!\n\n";
print map { $_, "\n", $ranks{$_}, "\n", $links{$_}, "\n" } sort { $links{$b} cmp $links{$a} } keys %ranks;
print "\nFertig!\n\n";

## Token nach link_ids auslesen

my @documents;

connect_db();

foreach ( keys %links ) {
    my $document = document_token_select($_);
    push @documents, $document;
}

disconnect_db();

## TF-IDF berechnen

my $tf_idf_calc = Lingua::TFIDF->new(
  # Use a word segmenter for japanese text.
  word_segmenter => Lingua::TFIDF::WordSegmenter::SplitBySpace->new,
);

my $tf_idfs = $tf_idf_calc->tf_idf(documents => \@documents);


## TF-IDF ausgeben

for ( my $i = 0; $i < scalar @documents; $i++ ) {
    # TF-IDF of word $_ in $_
    foreach ( keys $tf_idfs->[$i] ) {
        say "Say $_, doc$i: ", $tf_idfs->[$i]{$_}, "\n";
    }
}




###########################################################
############### Subroutinen ####################
###########################################################

# Subroutinen

sub connect_db {
    ## Verbindung zur DB herstellen
    $dbh = DBI->connect("DBI:Pg:dbname=links;host=localhost", "$DB_USER", "$DB_PASSWD");
}

sub disconnect_db {
    $dbh->disconnect();
}

# tsqueries

sub tsquery {
    print "\n\ntsquery läuft!\n\n";
    my $tsquery = shift;
    my $limit = shift;
    my $ranks_query = $dbh->prepare("    SELECT link_id, wordvector, ts_rank_cd(wordvector, query) AS rank
                                        FROM wordvectors, to_tsquery(?) query
                                        WHERE query @@ wordvector
                                        ORDER BY rank DESC
                                        LIMIT ?;");
    $ranks_query->execute($tsquery,$limit);
    my $texts_query = $dbh->prepare("SELECT link_id, text FROM texts WHERE link_id = ?;");
    my %ranks;
    my %wordvectors;
    my %texts;
    while ( my @query_row = $ranks_query->fetchrow_array ) {
        my $link_id = $query_row[0];
        my $wordvector = $query_row[1];
        my $rank = $query_row[2];
        $texts_query->execute($link_id);
        my $text = $texts_query->fetchrow;
        print "Link: $link_id Rank: $rank\n";
        $ranks{$link_id} = $rank;
        $wordvectors{$link_id} = $wordvector;
        $texts{$link_id} = $text;
    }
    my $links_query = $dbh->prepare("    SELECT link_id, link_name
                                        FROM links
                                        WHERE link_id = ?;");
    my %links;
    foreach ( keys %ranks ) {
        $links_query->execute($_);
        while ( my @query_row = $links_query->fetchrow_array ) {
            my $link_id = $query_row[0];
            my $link = $query_row[1];
            $link = encode('UTF8', $link, Encode::FB_DEFAULT);
            print "Link: $link_id\nURL: $link\n";
            $links{$link_id} = $link;
        }
    }
    return (\%ranks, \%links, \%wordvectors, \%texts)
}

# tsqueries_nogooglesearch

sub tsquery_nogooglesearch {
    print "\n\ntsquery_nogooglesearch läuft!\n\n";
    my $tsquery = shift;
    my $limit = shift;
    my $ranks_query = $dbh->prepare("    SELECT wordvectors.link_id, links.link_name, wordvector, ts_rank_cd(wordvector, query) AS rank
                                        FROM wordvectors, to_tsquery(?) query, links
                                        WHERE query @@ wordvector
                                        AND wordvectors.link_id = links.link_id
                                        AND links.link_name NOT LIKE '%google.com/search%'
                                        AND links.link_name NOT LIKE '%webcache.googleusercontent.com/search%'
                                        ORDER BY rank DESC
                                        LIMIT ?;");
    $ranks_query->execute($tsquery,$limit);
    my $texts_query = $dbh->prepare("SELECT link_id, text FROM texts WHERE link_id = ?;");
    my %ranks;
    my %wordvectors;
    my %texts;
    while ( my @query_row = $ranks_query->fetchrow_array ) {
        my $link_id = $query_row[0];
        my $wordvector = $query_row[1];
        my $rank = $query_row[2];
        $texts_query->execute($link_id);
        my $text = $texts_query->fetchrow;
        print "Link: $link_id Rank: $rank\n";
        $ranks{$link_id} = $rank;
        $wordvectors{$link_id} = $wordvector;
        $texts{$link_id} = $text;
       
    }
    my $links_query = $dbh->prepare("    SELECT link_id, link_name
                                        FROM links
                                        WHERE link_id = ?;");
    my %links;
    foreach ( keys %ranks ) {
        $links_query->execute($_);
        while ( my @query_row = $links_query->fetchrow_array ) {
            my $link_id = $query_row[0];
            my $link = $query_row[1];
            $link = encode('UTF8', $link, Encode::FB_DEFAULT);
            print "Link: $link_id\nURL: $link\n";
            $links{$link_id} = $link;
        }
    }
    return (\%ranks, \%links, \%wordvectors, \%texts)
}

# wordvector-table-Update

sub wordvector_update {
    my $links2wordvectors_upsert = $dbh->prepare("INSERT INTO links2wordvectors (link_id) SELECT link_id FROM links ORDER BY link_id ON CONFLICT (link_id) DO NOTHING;");
    $links2wordvectors_upsert->execute();
    my $link2do_query = $dbh->prepare("SELECT link_id, text FROM texts WHERE link_id IN (SELECT link_id FROM links2wordvectors WHERE wordvector_created = 0);");
    $link2do_query->execute();
    my $link2wordvector_upsert = $dbh->prepare("INSERT INTO wordvectors (link_id, text) VALUES (?,?) ON CONFLICT (link_id) DO NOTHING;");
    my $wordvectors_update = $dbh->prepare("UPDATE wordvectors SET wordvector = to_tsvector(text) WHERE link_id = ?;");
    my $links2wordvectors_update = $dbh->prepare("UPDATE links2wordvectors SET wordvector_created = ?, fdate = ? WHERE link_id = ?;");
    while ( my ($linkid, $text) = $link2do_query->fetchrow() ) {
        print "\n\nLink-ID: $linkid\n\n";
        $link2wordvector_upsert->execute($linkid,$text);
        $wordvectors_update->execute($linkid);
        $links2wordvectors_update->execute('1',date(),$linkid);
    }
}

# Subroutine Date

sub date {
    my $year = (localtime((time)))[5] + 1900;
    my $month = (localtime((time)))[4] + 1;
    my $mday = (localtime((time)))[3];
    my $date = $year.'-'.$month.'-'.$mday;
    return $date;
}

# Dokument-Token auslesen

sub document_token_select {
    my $link_id = shift;
    my $document_token_select = $dbh->prepare("SELECT token FROM (SELECT token(ts_debug(text)) FROM texts WHERE link_id = $link_id) AS token;");
    $document_token_select->execute();
    my @document_token;
    while ( my $token = $document_token_select->fetchrow() ) {
        if ( $token =~ /[a-zA-ZäöüÄÖÜß]+/ ) {
            push @document_token, $token;
        }
    }
    my $document_token_string = join ( " ", map { $_ } @document_token );
    return $document_token_string
}

 

 

Output

 

...
keys on reference is experimental at tsquery_tsrank_2_tfidf.pl line 99.
...
Say braucht, doc10: 1.01160091167848

Say Stiefel, doc10: 2.39789527279837

Say irgendwelche, doc10: 0.451985123743057

Say Hey, doc10: 0.78845736036427

Say Klinik, doc10: 1.57691472072854

Say wiegt, doc10: 0.451985123743057

Say Umzug, doc10: 1.01160091167848

Say somit, doc10: 1.01160091167848

Say Nach, doc10: 0.606135803570315

Say paralysiert, doc10: 2.39789527279837

Say Frust, doc10: 1.70474809223843

Say bevorzugt, doc10: 1.70474809223843

Say gesagt, doc10: 3.1638958662014

Say nachdenken, doc10: 1.70474809223843

Say ALG, doc10: 4.79579054559674

Say aufgaben, doc10: 2.39789527279837

Say offen, doc10: 1.01160091167848

Say ausgepr�gter, doc10: 1.70474809223843

Say lange, doc10: 0.955361193355604

Say anstarren/stalken, doc10: 2.39789527279837

Say hasst, doc10: 1.70474809223843

Say kiga, doc10: 19.183162182387

Say herausfinden, doc10: 1.29928298413026

Say Dank, doc10: 0.78845736036427

Say eifers�chtig, doc10: 1.70474809223843

Say Muss, doc10: 2.59856596826052

Say Winter, doc10: 3.40949618447685
...

Da stört nur noch die Ausgabe ( Say braucht keinen Zeilenumbruch, Alter!! ) und vor Allem das UTF8-Prob. Aber für heute habe ich genug geleistet, kümmer' ich mich morgen drum.
...

Hab's doch noch gemacht:

 

Code

 

# Dokument-Token auslesen

sub document_token_select {
    my $link_id = shift;
    my $document_token_select = $dbh->prepare("SELECT token FROM (SELECT token(ts_debug(text)) FROM texts WHERE link_id = $link_id) AS token;");
    $document_token_select->execute();
    my @document_token;
    while ( my $token = $document_token_select->fetchrow() ) {
        if ( $token =~ /[a-zA-ZäöüÄÖÜß]+/ ) {
            push @document_token, $token;
        }
    }
    my $document_token_string = join ( " ", map { $_ } @document_token );
    eval{$document_token_string = encode('utf8', $document_token_string, Encode::FB_CROAK)};
    return $document_token_string
}

 

Output

 

...
Say Erziehungsberechtigen, doc10: 1.70474809223843
Say überprüfen, doc10: 2.59856596826052
Say untersuchen, doc10: 0.78845736036427
Say Wirkstoff, doc10: 1.70474809223843
Say Eindruck, doc10: 1.01160091167848
Say vermuten, doc10: 2.59856596826052
Say Persönlichkeiten, doc10: 2.02320182335696
Say Anschaulichkeit, doc10: 1.70474809223843
Say Ungeschicktheit, doc10: 1.70474809223843
Say distanziert, doc10: 1.01160091167848
Say Home, doc10: 1.70474809223843
Say Bereitschaft, doc10: 1.70474809223843
Say gehen, doc10: 0.19062035960865
Say einzelne, doc10: 3.03480273503544
Say wenn, doc10: 0.57186107882595
Say Kommunikationsdefizite, doc10: 1.70474809223843
Say Sprachgebrauch, doc10: 1.70474809223843
Say handelt, doc10: 2.02320182335696
Say stehen, doc10: 0.451985123743057
Say sozial, doc10: 1.81840741071095
Say vorliest, doc10: 3.40949618447685
Say er, doc10: 0.285930539412975
Say Schönheit, doc10: 1.70474809223843
Say Asperger-Syndroms, doc10: 4.84908642856252
Say sollen, doc10: 0.606135803570315
Say also, doc10: 0.19062035960865
Say schließen, doc10: 3.03480273503544
Say konfrontiert, doc10: 1.70474809223843
Say gesicherten, doc10: 1.70474809223843
Say Sucht, doc10: 1.01160091167848
Say laufen, doc10: 0.78845736036427
Say Situationen, doc10: 3.1638958662014
Say Online-Informationen, doc10: 3.40949618447685
Say würde, doc10: 0.0953101798043249
Say aller, doc10: 0.78845736036427
Say Jörgensen, doc10: 1.70474809223843
Say Verhaltensmuster, doc10: 2.36537208109281
Say diesem, doc10: 0.200670695462151
Say milder, doc10: 1.70474809223843
Say Geschäftsmodell, doc10: 1.70474809223843
Say starre, doc10: 1.01160091167848
Say neben, doc10: 0.955361193355604
Say Aktuell, doc10: 1.70474809223843
Say erwidern, doc10: 1.70474809223843
Say anstrengend, doc10: 1.29928298413026
Say anwenden, doc10: 1.01160091167848
Say Autismus-Spektrum-Quotient, doc10: 1.70474809223843
Say ich, doc10: 0.0953101798043249
Say Ärztin, doc10: 6.8189923689537
Say mathematischen, doc10: 1.70474809223843
Say Empathiefähigkeit, doc10: 3.40949618447685
Say hinter, doc10: 3.94228680182135
Say Mathematik, doc10: 0.451985123743057
Say Intelligenz, doc10: 1.80794049497223
Say Lebensjahr, doc10: 1.91072238671121
Say Stand, doc10: 5.11424427671528
Say akkreditiert, doc10: 3.40949618447685
Say gemeint, doc10: 0.606135803570315
Say Entwicklungsstörung, doc10: 5.19713193652104
...


 
Und so verschwindet die Warnmeldung
 
keys on reference is experimental at tsquery_tsrank_2_tfidf.pl line 99.
 
:


## TF-IDF ausgeben

for ( my $i = 0; $i < scalar @documents; $i++ ) {
    # TF-IDF of word $_ in $_
    my $tfidf_ref = $tf_idfs->[$i];
    my %tfidf = %$tfidf_ref;
    foreach ( keys %tfidf ) {
        say "Say $_, doc$i: ", $tf_idfs->[$i]{$_};
    }
}


Kommentare

Beliebte Posts aus diesem Blog

·

Es brennt.

Bye, bye Nord Stream 2!