Cosine Similarity, Mindmaps#
Author: J. Busse, 6/2021, 2022-04-19
License: public domain / CC0
For further editing by students in the course dsci-txt
Cite this program as:
Busse 2021-06-16: Cosinus Ähnlichkeit, Mindmaps. IPYNB-Notebook, April 2022
import numpy as np
import pandas as pd
Global Parameters#
# path to files, incl. glob mask
path_to_files = "mm/*.mm"
# show intermediate results
# 0 none, 1 informative, 2 debug
verbosity = 2
def verbose(level, item):
    """Show item if its level does not exceed the global verbosity."""
    if level <= verbosity:
        display(item)  # display() is a built-in in IPython/Jupyter
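A quick throwaway check of the helper (with verbosity = 2 as set above, levels 1 and 2 are shown, level 3 is suppressed):
verbose(1, "shown")
verbose(3, "suppressed")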
Read Filenames#
# https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
import glob
files = glob.glob(path_to_files)
verbose(2,files)
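If the glob comes back empty, the rest of the notebook runs on an empty corpus and the TfIdf step below will fail. For a self-contained test run, one can first generate minimal Freeplane-style maps (a sketch; the file names and contents are made up for illustration, modelled on the nesting and the TEXT/CREATED/MODIFIED attributes that the walker below expects):
import os
os.makedirs("mm", exist_ok=True)
samples = {
    "mm/sample1.mm": '<map><node TEXT="root"><node TEXT="python" CREATED="1" MODIFIED="1"><node TEXT="pandas" CREATED="2" MODIFIED="2"/></node></node></map>',
    "mm/sample2.mm": '<map><node TEXT="root"><node TEXT="python" CREATED="1" MODIFIED="1"><node TEXT="numpy" CREATED="2" MODIFIED="2"/></node></node></map>',
}
for name, xml in samples.items():
    with open(name, "w") as f:
        f.write(xml)
# re-run the glob so the rest of the notebook sees the new files
files = glob.glob(path_to_files)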
Read Mindmaps#
Read in each map; collect all maps into a dictionary keyed by file name.
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, Comment, tostring, ElementTree
def walk_and_collect_dict(node, parent_text, resultdict):
    """walk mindmap, collect n-grams into resultdict"""
    myText = node.get('TEXT')
    # textAnalysiert = SpaCy.nlp(myText)
    # basic bag-of-words (BOW) items: the terms themselves
    resultdict[ "A_" + myText ] = 1
    # add n-gram to BOW, e.g. parent<-child
    resultdict[ "B_" + parent_text + "|" + myText ] = 1
    # add term plus time stamp of node creation to BOW
    #resultdict[ "C_" + myText + "_" + node.get('CREATED') ] = 1
    # add CREATED to BOW
    #resultdict[ "D_" + "CREATED_" + node.get('CREATED') ] = 1
    # add MODIFIED to BOW
    # resultdict[ "E_" + "MODIFIED_" + node.get('MODIFIED') ] = 1
    for child in node.findall('node'):
        walk_and_collect_dict(child, myText, resultdict)
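To see what the walker produces, here is a throwaway run on an in-memory node (the XML snippet is made up):
demo_node = ET.fromstring('<node TEXT="a"><node TEXT="b"/></node>')
demo_bow = {}
walk_and_collect_dict(demo_node, "TOP", demo_bow)
demo_bow
# -> {'A_a': 1, 'B_TOP|a': 1, 'A_b': 1, 'B_a|b': 1}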
def read_mm_files(files):
    corpus = {}
    # walk through all files
    for file in files:
        # verbose(3, file)
        # load file as an XML element tree
        with open(file) as file_ref:
            verbose(2, "reading {}".format(file))
            # https://docs.python.org/2/library/xml.etree.elementtree.html#parsing-xml
            # parse mindmap file
            tree = ET.parse(file_ref)
            # point root to xml root-element "/map"
            root = tree.getroot()
        tokens = {}
        for n in root.findall('node/node'):
            walk_and_collect_dict(n, "TOP", tokens)
        corpus[file] = tokens
    return corpus
corpus_dict = read_mm_files(files)
corpus_dict
# https://www.geeksforgeeks.org/how-to-create-dataframe-from-dictionary-in-python-pandas/
# Method 6: Create DataFrame from nested Dictionary.
# do not modify
corpus_df = pd.DataFrame(corpus_dict).T.fillna(0)
# show
corpus_df.T
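The nested dict of dicts becomes a document-term matrix: after the transpose, outer keys (file names) turn into rows, inner keys (BOW features) into columns, and missing features are filled with 0. On made-up data:
demo_corpus = {"doc1": {"A_x": 1, "A_y": 1}, "doc2": {"A_y": 1, "A_z": 1}}
pd.DataFrame(demo_corpus).T.fillna(0)
#       A_x  A_y  A_z
# doc1  1.0  1.0  0.0
# doc2  0.0  1.0  1.0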
# look at only one particular class of columns,
# here: all columns that start with 'C'
[ c for c in corpus_df.columns if c[0] == 'C']
# if desired:
# split the whole corpus into a dictionary of corpora
corpus_df_dict = {}
typliste = ['A', 'B', 'C', 'D']
for t in typliste:
    Auswahl = [ c for c in corpus_df.columns if c[0] == t]
    print(t, Auswahl)
    corpus_df_dict[t] = corpus_df[Auswahl]
for t in typliste:
    display(corpus_df_dict[t].T)
# if desired: look at selected types only, e.g. C and D combined:
#corpus_df = pd.concat([corpus_df_dict['C'], corpus_df_dict['D']], axis=1)
#corpus_df
TfIdf#
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
verbose(1,transformer)
TfidfTransformer(smooth_idf=False)
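With smooth_idf=False, scikit-learn computes idf(t) = ln(n / df(t)) + 1 and multiplies it with the raw term frequency (followed by L2 normalization by default). A hand check on a tiny made-up count matrix, with normalization switched off:
demo_counts = np.array([[1, 1, 0],
                        [1, 0, 1]])
TfidfTransformer(smooth_idf=False, norm=None).fit_transform(demo_counts).toarray()
# term 0 occurs in both documents: idf = ln(2/2) + 1 = 1
# terms 1 and 2 occur in one document each: idf = ln(2/1) + 1 ≈ 1.69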
tfidf = transformer.fit_transform(corpus_df)
tfidf
(If no .mm files were found, the corpus is empty and this cell fails with ValueError: at least one array or dtype is required.)
verbose(1, pd.DataFrame(tfidf.toarray()))
Cosine Similarity#
Interesting, possibly worth trying as well: https://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
We keep it rather low level here, so that we can look under the hood:
https://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
Didactic explanation: http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
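The quantity being computed is cos(u, v) = u·v / (|u| |v|), the cosine of the angle between two document vectors, independent of their lengths. By hand, on two made-up vectors:
u = np.array([1.0, 1.0, 0.0])
v = np.array([1.0, 0.0, 1.0])
np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
# -> 0.5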
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(tfidf)
verbose(2,similarity)
similarity_df = pd.DataFrame(similarity)
similarity_df.columns = files
similarity_df.index = files
similarity_df
import seaborn as sns
# mask the upper triangle (including the diagonal), since the
# similarity matrix is symmetric
mask = np.zeros_like(similarity, dtype=bool)
mask[np.triu_indices_from(mask)] = True
ax = sns.heatmap(similarity_df, mask=mask, annot=True, cmap='RdBu')
# clustermap returns a ClusterGrid, which has its own savefig()
cg = sns.clustermap(similarity_df, annot=True, cmap='RdBu')
cg.savefig("clustermap.png")
The clustermap orders rows and columns by hierarchical clustering; for the available linkage methods see https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html