import nltk
# Fetch NLTK's 'popular' data collection up front: the stopword list, the
# tokenizer and the WordNet lemmatizer used further down all need NLTK data
# to be installed locally.
nltk.download('popular')

# Sample sentence for the preprocessing walkthrough, lowercased straight away
# so that "Linux" and "linux" end up as the same token later on.
text = "Linux systems are used by less than 2% of people while 55% of developpers are using Linux. Maybe people should use it more often ;-) ".lower()
text

'linux systems are used by less than 2% of people while 55% of developpers are using linux. maybe people should use it more often ;-) '

# Iterating over a str yields single characters, so this filters out every
# digit character (not whole numeric words): "2%" becomes "%".
text = ''.join(char for char in text if not char.isdigit())
text

'linux systems are used by less than % of people while % of developpers are using linux. maybe people should use it more often ;-) '

import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Delete every ASCII punctuation character in a single pass: the translation
# table maps each character of string.punctuation to None (i.e. removes it).
text = text.translate(str.maketrans('', '', string.punctuation))

text

'linux systems are used by less than  of people while  of developpers are using linux maybe people should use it more often  '

from nltk.corpus import stopwords
# Keep the English stopwords in a set so that the membership test used when
# filtering tokens is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english')) 
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves'}

from nltk.tokenize import word_tokenize
# Split the cleaned string into a list of word tokens.
word_tokens = word_tokenize(text)

word_tokens

['linux',
 'systems',
 'are',
 'used',
 'by',
 'less',
 'than',
 'of',
 'people',
 'while',
 'of',
 'developpers',
 'are',
 'using',
 'linux',
 'maybe',
 'people',
 'should',
 'use',
 'it',
 'more',
 'often']

# Drop the stopwords. Note that `text` is rebound here from a str to a list
# of tokens.
text = [token for token in word_tokens if token not in stop_words]
text

['linux',
 'systems',
 'used',
 'less',
 'people',
 'developpers',
 'using',
 'linux',
 'maybe',
 'people',
 'use',
 'often']

text

['linux',
 'systems',
 'used',
 'less',
 'people',
 'developpers',
 'using',
 'linux',
 'maybe',
 'people',
 'use',
 'often']

from nltk.stem.porter import PorterStemmer
# Stemming strips suffixes heuristically, so the results need not be real
# words (e.g. "people" -> "peopl" in the output below). `text` itself is
# left untouched.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, text))
stemmed

['linux',
 'system',
 'use',
 'less',
 'peopl',
 'developp',
 'use',
 'linux',
 'mayb',
 'peopl',
 'use',
 'often']

text

['linux',
 'systems',
 'used',
 'less',
 'people',
 'developpers',
 'using',
 'linux',
 'maybe',
 'people',
 'use',
 'often']

from nltk.stem import WordNetLemmatizer
# Lemmatisation maps each token to a dictionary form. lemmatize() is called
# without a `pos` argument, so NLTK's default part of speech applies (noun,
# per the NLTK docs) -- presumably why "used"/"using" stay unchanged below.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token) for token in text]
# From here on `text` refers to the lemmatized token list.
text = lemmatized
lemmatized

['linux',
 'system',
 'used',
 'le',
 'people',
 'developpers',
 'using',
 'linux',
 'maybe',
 'people',
 'use',
 'often']

from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: four short documents that share most of their vocabulary.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Bag-of-words: fit_transform() learns the vocabulary from the corpus and
# returns the document-term count matrix (one row per document, one column
# per vocabulary term).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# Display the learned vocabulary; column i of X counts occurrences of term i.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

import pandas as pd
# Wrap the dense count matrix in a DataFrame whose columns are labelled with
# the vocabulary terms, which is easier to read than the raw array.
pd.DataFrame(data = X.toarray(), columns = vectorizer.get_feature_names_out())

# NOTE(review): this statement repeats the previous one verbatim -- confirm
# whether a different vectorizer was meant to be displayed here.
pd.DataFrame(data = X.toarray(), columns = vectorizer.get_feature_names_out())

# ngram_range=(2, 2) makes the vectorizer count word bigrams only (pairs of
# consecutive words) instead of single words.
bigrams = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X_bigrams = bigrams.fit_transform(corpus)
pd.DataFrame(data = X_bigrams.toarray(), columns = bigrams.get_feature_names_out())

# A float max_df is a proportion of documents: terms appearing in more than
# 80% of the documents are dropped from the vocabulary. With this 4-document
# corpus that removes the words present in all four documents.
# Rebinds `vectorizer` and `X` from the earlier cells.
vectorizer = CountVectorizer(max_df=0.8)
X = vectorizer.fit_transform(corpus)
pd.DataFrame(data = X.toarray(), columns = vectorizer.get_feature_names_out())

# A float min_df is a proportion of documents: terms appearing in fewer than
# 30% of the documents are dropped from the vocabulary. With this 4-document
# corpus that removes the words present in a single document only.
# Rebinds `vectorizer` and `X` from the earlier cells.
vectorizer = CountVectorizer(min_df=0.3)
X = vectorizer.fit_transform(corpus)
pd.DataFrame(data = X.toarray(), columns = vectorizer.get_feature_names_out())

from gensim.models import Word2Vec
from gensim.test.utils import common_texts

common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

# Train a small Word2Vec model on the toy corpus: 100-dimensional vectors, a
# context window of 5 tokens, and min_count=1 so that no word is discarded
# for being rare.
# NOTE(review): workers=4 -- gensim documents that fully reproducible
# training requires a single worker (plus a fixed PYTHONHASHSEED), so the
# vectors and similarity scores shown below may differ between runs.
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

model.wv['computer']

array([-0.00515774, -0.00667028, -0.0077791 ,  0.00831315, -0.00198292,
       -0.00685696, -0.0041556 ,  0.00514562, -0.00286997, -0.00375075,
        0.0016219 , -0.0027771 , -0.00158482,  0.0010748 , -0.00297881,
        0.00852176,  0.00391207, -0.00996176,  0.00626142, -0.00675622,
        0.00076966,  0.00440552, -0.00510486, -0.00211128,  0.00809783,
       -0.00424503, -0.00763848,  0.00926061, -0.00215612, -0.00472081,
        0.00857329,  0.00428459,  0.0043261 ,  0.00928722, -0.00845554,
        0.00525685,  0.00203994,  0.0041895 ,  0.00169839,  0.00446543,
        0.0044876 ,  0.0061063 , -0.00320303, -0.00457706, -0.00042664,
        0.00253447, -0.00326412,  0.00605948,  0.00415534,  0.00776685,
        0.00257002,  0.00811905, -0.00138761,  0.00808028,  0.0037181 ,
       -0.00804967, -0.00393476, -0.0024726 ,  0.00489447, -0.00087241,
       -0.00283173,  0.00783599,  0.00932561, -0.0016154 , -0.00516075,
       -0.00470313, -0.00484746, -0.00960562,  0.00137242, -0.00422615,
        0.00252744,  0.00561612, -0.00406709, -0.00959937,  0.00154715,
       -0.00670207,  0.0024959 , -0.00378173,  0.00708048,  0.00064041,
        0.00356198, -0.00273993, -0.00171105,  0.00765502,  0.00140809,
       -0.00585215, -0.00783678,  0.00123305,  0.00645651,  0.00555797,
       -0.00897966,  0.00859466,  0.00404816,  0.00747178,  0.00974917,
       -0.0072917 , -0.00904259,  0.0058377 ,  0.00939395,  0.00350795],
      dtype=float32)

model.wv.most_similar('computer', topn=3)

[('system', 0.21617139875888824),
 ('survey', 0.04468922317028046),
 ('interface', 0.015203381888568401)]

# Words closest to 'human' and 'interface' taken together: a list passed as
# the first argument is treated as *positive* terms, so the two vectors are
# combined, not subtracted. For "human - interface" the call would be
# most_similar(positive=['human'], negative=['interface']).
model.wv.most_similar(['human','interface'], topn=3)

[('response', 0.14778193831443787),
 ('eps', 0.12549911439418793),
 ('system', 0.09567634016275406)]

Leçon: Introduction au Traitement du langage Naturel (TALN) ou (NLP)¶

Ressources¶

La librairie NLTK¶

La librairie Gensim¶

Le framework Spacy¶

Installation¶

Le framework Hugging Face¶

Les pré-traitements "standard" spécifiques au texte¶

Gestion de la casse¶

Gestion des caractères spéciaux¶

Enlever les nombres¶

Enlever la ponctuation¶

Caractères ou motifs particuliers¶

Enlever les stopwords¶

La tokenization¶

La racinisation ou stemming¶

La lemmatisation¶

La représentation Bag-of-words¶

Avantages & Inconvénients¶

Vectorisation simple¶

Vectorisation Term Frequency - Inverse Document Frequency (Tf-Idf)¶

Les paramètres de vectorisation de CountVectorizer et TfidfVectorizer¶

Le paramètre ngram_range¶

Le paramètre max_df¶

Le paramètre min_df¶

Les word embeddings / Distributional Semantic Models¶

Intérêt¶

Capturer la similarité sémantique¶

Capturer des relations entre les mots¶

Quelques méthodes d'embedding "généralistes"¶

Word2Vec¶

Continuous Bag-of-words (CBoW)¶

Skip-gram¶

Exemple de Word2Vec avec gensim¶

Doc2Vec¶

GloVe : Global Vectors for word representation¶

Quelques algorithmes utilisés en TALN (avec Bag-of-words ou des word embeddings)¶

Dans des tâches de classification : par ex. filtre anti-spam¶

Tâche de topic modeling:¶

Tâche de production de texte¶

Quel word embedding utiliser ?¶

Les Transformers¶

Sources¶

La librairie NLTK ¶

La librairie Gensim ¶

Le framework Spacy ¶

Installation ¶

Le framework Hugging Face ¶

La racinisation ou stemming ¶

La lemmatisation ¶

La représentation Bag-of-words ¶

Le paramètre `ngram_range`¶

Le paramètre `max_df`¶

Le paramètre `min_df`¶