Auto-Generated Knowledge Graph - English
Code from here: https://towardsdatascience.com/auto-generated-knowledge-graphs-92ca99a81121 (Auto-Generated Knowledge Graphs)
For now in English; I will try to translate it into German (probably in the next post):
# pip install wikipedia-api
# if needed: pip3 install --upgrade --user git+https://github.com/lucasdnd/Wikipedia.git
import wikipediaapi
import pandas as pd
def wiki_scrape(topic):
    wiki_api = wikipediaapi.Wikipedia(language='en',
                                      extract_format=wikipediaapi.ExtractFormat.WIKI)
    page_name = wiki_api.page(topic)
    if not page_name.exists():
        print('page does not exist')
        exit()
    page_links = [topic] + list(page_name.links.keys())
    working_links = [link for link in page_links
                     if wiki_api.page(link).exists()]
    blacklist = ('Template', 'Help:', 'Category:', 'Portal:', 'Wikipedia:', 'Talk:')
    sources = [link for link in working_links
               if len(wiki_api.page(link).text) > 20
               and not link.startswith(blacklist)]
    wiki_data = {'topic': topic, 'page': sources}
    wiki_data['text'] = [wiki_api.page(page).text for page in sources]
    wiki_data['link'] = [wiki_api.page(page).fullurl for page in sources]
    wiki_data['categories'] = [[y[9:] for y in
                                list(wiki_api.page(page).categories.keys())]
                               for page in sources]
    print('Wikipedia pages scraped:', len(sources))
    wiki_scrape_df = pd.DataFrame(wiki_data)
    return wiki_scrape_df
#wiki_data = wiki_scrape('Financial crisis of 2007–08')
wiki_data = wiki_scrape('2019–20 coronavirus pandemic')
wiki_data
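The returned DataFrame has one row per scraped page, with the columns built in wiki_scrape above ('topic', 'page', 'text', 'link', 'categories'). A minimal sketch for a quick sanity check; the keyword 'pandemic' is just an example:

print(wiki_data.shape)
print(wiki_data[['page', 'link']].head())
# pages whose categories mention the keyword
mask = wiki_data['categories'].apply(lambda cats: any('pandemic' in c.lower() for c in cats))
print(wiki_data.loc[mask, 'page'].head())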
import pandas as pd
import re
import spacy
import neuralcoref
nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)
# Normally a built-in function
def filter_spans(spans):
    # Filter a sequence of spans so they don't contain overlaps
    # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans()
    get_sort_key = lambda span: (span.end - span.start, -span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
    result = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            seen_tokens.update(range(span.start, span.end))
    result = sorted(result, key=lambda span: span.start)
    return result
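A quick way to see what filter_spans does (a small sketch; the sentence is only an example): named entities and noun chunks often overlap, and the function keeps the longest non-overlapping spans.

doc = nlp('The World Health Organization declared a global pandemic in March 2020.')
spans = list(doc.ents) + list(doc.noun_chunks)
print([s.text for s in spans])                # may contain overlapping spans
print([s.text for s in filter_spans(spans)])  # longest non-overlapping spans only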
def entity_pairs(text, coref=True):
    text = re.sub(r'\n+', '.', text)  # replace multiple newlines with period
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers
    text = nlp(text)
    if coref:
        text = nlp(text._.coref_resolved)  # resolve coreference clusters
    sentences = [sent.string.strip() for sent in text.sents]  # split text into sentences
    ent_pairs = list()
    for sent in sentences:
        sent = nlp(sent)
        spans = list(sent.ents) + list(sent.noun_chunks)  # collect nodes
        #spans = spacy.util.filter_spans(spans)
        spans = filter_spans(spans)
        with sent.retokenize() as retokenizer:
            [retokenizer.merge(span) for span in spans]
        dep = [token.dep_ for token in sent]
        if (dep.count('obj') + dep.count('dobj')) == 1 \
                and (dep.count('subj') + dep.count('nsubj')) == 1:
            for token in sent:
                if token.dep_ in ('obj', 'dobj'):  # identify object nodes
                    subject = [w for w in token.head.lefts if w.dep_
                               in ('subj', 'nsubj')]  # identify subject nodes
                    if subject:
                        subject = subject[0]
                        # identify relationship by root dependency
                        relation = [w for w in token.ancestors if w.dep_ == 'ROOT']
                        if relation:
                            relation = relation[0]
                            # add adposition or particle to relationship
                            if relation.nbor(1).pos_ in ('ADP', 'PART'):
                                relation = ' '.join((str(relation),
                                                     str(relation.nbor(1))))
                        else:
                            relation = 'unknown'
                        subject, subject_type = refine_ent(subject, sent)
                        token, object_type = refine_ent(token, sent)
                        ent_pairs.append([str(subject), str(relation), str(token),
                                          str(subject_type), str(object_type)])
    filtered_ent_pairs = [sublist for sublist in ent_pairs
                          if not any(str(x) == '' for x in sublist)]
    pairs = pd.DataFrame(filtered_ent_pairs, columns=['subject',
                         'relation', 'object', 'subject_type',
                         'object_type'])
    print('Entity pairs extracted:', str(len(filtered_ent_pairs)))
    return pairs
def refine_ent(ent, sent):
    unwanted_tokens = (
        'PRON',   # pronouns
        'PART',   # particle
        'DET',    # determiner
        'SCONJ',  # subordinating conjunction
        'PUNCT',  # punctuation
        'SYM',    # symbol
        'X',      # other
    )
    ent_type = ent.ent_type_  # get entity type
    if ent_type == '':
        ent_type = 'NOUN_CHUNK'
        ent = ' '.join(str(t.text) for t in
                       nlp(str(ent)) if t.pos_
                       not in unwanted_tokens and t.is_stop == False)
    elif ent_type in ('NOMINAL', 'CARDINAL', 'ORDINAL') and str(ent).find(' ') == -1:
        t = ''
        for i in range(len(sent) - ent.i):
            if ent.nbor(i).pos_ not in ('VERB', 'PUNCT'):
                t += ' ' + str(ent.nbor(i))
            else:
                ent = t.strip()
                break
    return ent, ent_type
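refine_ent is called from entity_pairs on the subject and object tokens; a small standalone sketch (the sentence and its parse are model-dependent, so the output is only illustrative):

doc = nlp('Italy reported many new cases in March.')
obj_tokens = [t for t in doc if t.dep_ in ('obj', 'dobj')]
if obj_tokens:
    print(refine_ent(obj_tokens[0], doc))  # e.g. ('cases', 'NOUN_CHUNK')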
pairs = entity_pairs(wiki_data.loc[0,'text'])
pairs
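Before plotting, it can help to see which relations and subjects dominate the extracted triples (plain pandas, nothing specific to the article's code):

print(pairs['relation'].value_counts().head(10))
print(pairs['subject'].value_counts().head(10))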
import networkx as nx
import matplotlib.pyplot as plt
def draw_kg(pairs):
    k_graph = nx.from_pandas_edgelist(pairs, 'subject', 'object',
                                      create_using=nx.MultiDiGraph())
    node_deg = nx.degree(k_graph)
    layout = nx.spring_layout(k_graph, k=0.15, iterations=20)
    plt.figure(num=None, figsize=(120, 90), dpi=80)
    nx.draw_networkx(
        k_graph,
        node_size=[int(deg[1]) * 500 for deg in node_deg],
        arrowsize=20,
        linewidths=1.5,
        pos=layout,
        edge_color='red',
        edgecolors='black',
        node_color='white',
    )
    labels = dict(zip(list(zip(pairs.subject, pairs.object)),
                      pairs['relation'].tolist()))
    nx.draw_networkx_edge_labels(k_graph, pos=layout, edge_labels=labels,
                                 font_color='red')
    plt.axis('off')
    plt.show()
draw_kg(pairs)
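With thousands of pairs the full graph quickly becomes unreadable. A minimal sketch (not part of the original article's code as reproduced here) for drawing only the edges that touch a single node; the node name 'China' is just an example and has to appear in the pairs DataFrame:

def draw_kg_for_node(pairs, node):
    # keep only triples in which the node appears as subject or object
    subset = pairs[(pairs['subject'] == node) | (pairs['object'] == node)]
    draw_kg(subset)

draw_kg_for_node(pairs, 'China')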
Source: Auto-Generated Knowledge Graphs, https://towardsdatascience.com/auto-generated-knowledge-graphs-92ca99a81121