mlinsights/_doc/examples/plot_traceable_ngrams_tfidf.py at 37347dcbfe5eb26abf87a5a42f2b73a78bccefe9 · sdpython/mlinsights · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
Traceable n-grams with tf-idf
=============================

The notebook looks into the way n-grams are stored in
`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
and
`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer>`_
and how the current storage (scikit-learn <= 0.21) is ambiguous in some cases.

Example with CountVectorizer
----------------------------

scikit-learn version
~~~~~~~~~~~~~~~~~~~~
"""

import numpy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from mlinsights.mlmodel.sklearn_text import (
    TraceableCountVectorizer,
    TraceableTfidfVectorizer,
)

# Small corpus shared by both vectorizers of this section.
# The last document is an empty string.
corpus = numpy.array(
    [
        "This is the first document.",
        "This document is the second document.",
        "Is this the first document?",
        "",
    ]
)

mod1 = CountVectorizer(ngram_range=(1, 2))
mod1.fit(corpus)

########################################
#

# ``toarray`` yields a plain ndarray; ``todense`` would return the
# discouraged ``numpy.matrix`` type.
mod1.transform(corpus).toarray()

########################################
#

mod1.vocabulary_

######################################################################
# mlinsights version
# ~~~~~~~~~~~~~~~~~~

# Same parameters and same corpus as the scikit-learn model above, so the
# two outputs can be compared directly.
mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
mod2.fit(corpus)

########################################
#

mod2.transform(corpus).toarray()

########################################
#

mod2.vocabulary_


######################################################################
# The new class does the exact same thing but keeps n-grams in a more
# explicit form. The original form as a string is sometimes ambiguous, as
# the next example shows.
#
# Funny example with TfidfVectorizer
# ----------------------------------
#
# scikit-learn version
# ~~~~~~~~~~~~~~~~~~~~


# Same corpus as in the CountVectorizer example, repeated so that this
# section can be read on its own.
corpus = numpy.array(
    [
        "This is the first document.",
        "This document is the second document.",
        "Is this the first document?",
        "",
    ]
)

# Unusual token pattern: it matches runs of 1 to 4 letters or spaces, so a
# token can itself contain spaces. Defined once so that both vectorizers
# below are guaranteed to use the same pattern.
token_pattern = r"[a-zA-Z ]{1,4}"

mod1 = TfidfVectorizer(ngram_range=(1, 2), token_pattern=token_pattern)
mod1.fit(corpus)

########################################
#

# ``toarray`` yields a plain ndarray; ``todense`` would return the
# discouraged ``numpy.matrix`` type.
mod1.transform(corpus).toarray()

########################################
#

mod1.vocabulary_


######################################################################
# mlinsights version
# ~~~~~~~~~~~~~~~~~~


mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2), token_pattern=token_pattern)
mod2.fit(corpus)

########################################
#

mod2.transform(corpus).toarray()

########################################
#

mod2.vocabulary_


######################################################################
# As you can see, the original 30th n-gram ``'t is  the'`` is a little
# bit ambiguous. It is in fact ``('t is', ' the')``, as the
# *TraceableTfidfVectorizer* lets you know. The original form could have
# been ``('t', 'is  the')``, ``('t is', '  the')``, ``('t is ', ' the')``,
# ``('t is  ', 'the')``, ``('t', 'is  ', 'the')``\ … The regular
# expression gives some insight, but not information that can easily be
# used to guess the right one.