Timothy Clark
Adapted from: pythonprogramming.net
# Ubuntu Linux
sudo apt install -y python3 python3-pip
# Mac - open a terminal!
# Windows
py main.py
# Mac/Linux
python3 main.py
def add(a, b):
    print("{} + {} = {}".format(a, b, a + b))
print("Hello World!")
add(2, 6)
import nltk
nltk.download()
# https://www.nltk.org
# Windows
py -m pip install nltk
# Mac/Linux
pip install nltk
pip3 install nltk
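The interactive nltk.download() window is handy, but you can also fetch just the data these examples use by name; a minimal sketch (these are the standard NLTK data package identifiers):
import nltk
for pkg in ["punkt", "stopwords", "state_union", "gutenberg", "wordnet",
            "movie_reviews", "averaged_perceptron_tagger",
            "maxent_ne_chunker", "words"]:
    nltk.download(pkg)  # saved under ~/nltk_data by default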
Splitting text up into sentences, and sentences into words.
These pieces are called tokens.
from nltk.tokenize import sent_tokenize, word_tokenize
txt = "Hello Mr. Smith, how are you doing today? The weather is great, and NLTK is awesome. The sky is kinda blue. You shouldn't eat cardboard."
sentences = sent_tokenize(txt)
print(sentences)
print()
for s in sentences:
    print(s)
    words = word_tokenize(s)
    print(words)
Stop words are the words we don't really care about: filler that adds little to the semantic meaning of a sentence, so we strip it out before analysis.
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
sentence = "Hello Mr. Smith, how are you doing today?"
sentence = word_tokenize(sentence)
stop_words = set(stopwords.words("english"))
print(stop_words)
filtered = [w for w in sentence if w not in stop_words]
# Equivalent to:
# filtered = []
# for w in sentence:
#     if w not in stop_words:
#         filtered.append(w)
print(filtered)
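One gotcha: stopwords.words("english") is all lowercase, so a capitalised "How" would slip through the filter above. A safer variant normalises case first:
filtered = [w for w in sentence if w.lower() not in stop_words]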
When processing natural language, the semantic meaning often relates to the "stem" of a word ("walk") rather than the surface form you actually met ("walking", "walked")
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
ls = LancasterStemmer()
words = ["walk", "walker", "walking", "walked", "walkly"]
for w in words: print(w, ps.stem(w))
# for w in words: print(w, ls.stem(w))
PoS (part-of-speech) tagging involves working out the grammatical role of each token you are looking at: noun, verb, adverb, and so on
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
process_content()
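If a tag like NNP or RB means nothing to you, NLTK bundles the Penn Treebank tagset documentation (grab the "tagsets" package via nltk.download first):
import nltk
nltk.help.upenn_tagset("NNP")  # proper noun, singular
nltk.help.upenn_tagset("RB")   # adverb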
Chunking involves extracting "chunks" of text: groups of words matched by patterns over the part-of-speech tags from before
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenised = custom_sent_tokenizer.tokenize(sample_text)
try:
    for i in tokenised:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        # 0+ adverbs, 0+ verbs, 1+ proper nouns, then an optional noun
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        print(chunked)
        chunked.draw()
except Exception as e:
    print(str(e))
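chunked is an nltk.Tree, so instead of drawing a window for every sentence you can print just the matched pieces; a small sketch using the Tree API, dropped in where print(chunked) sits above:
for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
    print(subtree)  # only the subtrees our grammar labelled "Chunk"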
"Chinking" is the reverse: removing something from a "chunk" that you have extracted from a body of text
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenised = custom_sent_tokenizer.tokenize(sample_text)
try:
    for i in tokenised[5:]:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        # Chunk everything, then chink (the }...{ part removes) verbs, prepositions, determiners and "to"
        chunkGram = r"""Chunk: {<.*>+}
                               }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunked.draw()
except Exception as e:
    print(str(e))
Named entity recognition: spotting "entities" (people, organisations, places, ...) in the text we're analysing
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # namedEnt = nltk.ne_chunk(tagged)
            namedEnt = nltk.ne_chunk(tagged, binary=True)  # binary=True labels every entity plain "NE" rather than by type (PERSON, GPE, ...)
            namedEnt.draw()
    except Exception as e:
        print(str(e))
process_content()
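To collect entity strings instead of drawing trees, you can walk the parse tree; a sketch of a hypothetical helper (with binary=True the entity subtrees are labelled "NE"):
def entities(tagged_sentence):
    tree = nltk.ne_chunk(tagged_sentence, binary=True)
    return [" ".join(word for word, tag in t.leaves())
            for t in tree.subtrees(filter=lambda t: t.label() == "NE")]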
Let's read the Bible!
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize
text = gutenberg.raw("bible-kjv.txt")
tokens = sent_tokenize(text)
print(tokens[5:15])
# Check out the corpus! - nltk.download()
from nltk.corpus import wordnet
syns = wordnet.synsets("program")
# Synset
print(syns[0].name())
# Word Only
print(syns[0].lemmas()[0].name())
# Definition
print(syns[0].definition())
# Examples
print(syns[0].examples())
# Synonyms & Antonyms
synonyms = []
antonyms = []
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        # print("l: ", l)
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(set(synonyms))
print(set(antonyms))
# Similarity (Wu-Palmer: a 0-1 score based on the two senses' depths and their deepest common hypernym)
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")
print(w1.wup_similarity(w2))
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")
print(w1.wup_similarity(w2))
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cactus.n.01")
print(w1.wup_similarity(w2))
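Wu-Palmer is only one of WordNet's similarity measures; path_similarity, based on the shortest hypernym path between two senses, is another:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.path_similarity(w2))  # in (0, 1], higher means more similar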
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print()
print(lemmatizer.lemmatize("better")) # Default pos is NOUN
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", pos="v"))
What is classification? Assigning a label, such as positive or negative, to a piece of text. We'll train classifiers to label movie reviews.
import nltk
import random
from nltk.corpus import movie_reviews
# One Liner
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())  # Convert to lowercase
all_words = nltk.FreqDist(all_words) # NLTK Frequency Distribution
word_features = [w for w, count in all_words.most_common(3000)]  # Top 3,000 words by frequency (FreqDist.keys() isn't frequency-ordered)
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)  # Boolean: is this feature word present in the document?
    return features
# print(find_features(movie_reviews.words("neg/cv000_29416.txt")))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900] # First 1900
testing_set = featuresets[1900:] # Post 1900
# Naive Bayes Algorithm
# Posterior = Prior x Likelihood / Evidence
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Accuracy: {:f}%".format((nltk.classify.accuracy(classifier, testing_set))*100))
classifier.show_most_informative_features(15)
# Word Ratios (Found more in pos or neg)
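Beyond a bare label, the trained Naive Bayes model can report its posterior probability for each class; a quick sketch on one review from the corpus:
feats = find_features(movie_reviews.words("neg/cv000_29416.txt"))
dist = classifier.prob_classify(feats)  # posterior distribution over labels
for label in dist.samples():
    print(label, dist.prob(label))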
import nltk
# Data Sets (same 1900/100 split as before)
training_set = featuresets[:1900]
testing_set = featuresets[1900:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
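A later slide loads a trained classifier from naivebayes.pickle; here is a minimal sketch of how that file could have been written in the first place (pickle is in the standard library):
import pickle
save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()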
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
# One Liner
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())  # Convert to lowercase
all_words = nltk.FreqDist(all_words) # NLTK Frequency Distribution
word_features = [w for w, count in all_words.most_common(3000)]  # Top 3,000 words by frequency
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)  # Boolean: is this feature word present in the document?
    return features
# print(find_features(movie_reviews.words("neg/cv000_29416.txt")))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900] # First 1900
testing_set = featuresets[1900:] # Post 1900
# Naive Bayes Algorithm
# Posterior = Prior x Likelihood / Evidence
classifier = nltk.NaiveBayesClassifier.train(training_set)
# Open Pickle Classifier
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("ORIGINAL Naive Bayes Accuracy: {:.2f}%".format((nltk.classify.accuracy(classifier, testing_set))*100))
classifier.show_most_informative_features(15)
# Word Ratios (Found more in pos or neg)
MNB_classifier = SklearnClassifier(MultinomialNB()) # Wrapper converts to NLTK classifier
MNB_classifier = MNB_classifier.train(training_set)
print("MNB_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(MNB_classifier, testing_set)*100))
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier = BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(BernoulliNB_classifier, testing_set)*100))
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(solver="liblinear"))
LogisticRegression_classifier = LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(LogisticRegression_classifier, testing_set)*100))
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier = SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(SGDClassifier_classifier, testing_set)*100))
# Very Inaccurate (< 50%)
SVC_classifier = SklearnClassifier(SVC(gamma="auto"))
SVC_classifier = SVC_classifier.train(training_set)
print("SVC_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(SVC_classifier, testing_set)*100))
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier = LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(LinearSVC_classifier, testing_set)*100))
NuSVC_classifier = SklearnClassifier(NuSVC(gamma="auto"))
NuSVC_classifier = NuSVC_classifier.train(training_set)
print("NuSVC_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(NuSVC_classifier, testing_set)*100))
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)  # majority vote across the wrapped classifiers

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)  # fraction that voted with the majority, e.g. 5/7 ≈ 0.71
        return conf
# One Liner
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())  # Convert to lowercase
all_words = nltk.FreqDist(all_words) # NLTK Frequency Distribution
word_features = [w for w, count in all_words.most_common(3000)]  # Top 3,000 words by frequency
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)  # Boolean: is this feature word present in the document?
    return features
# print(find_features(movie_reviews.words("neg/cv000_29416.txt")))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900] # First 1900
testing_set = featuresets[1900:] # Post 1900
# Naive Bayes Algorithm
# Posterior = Prior x Likelihood / Evidence
classifier = nltk.NaiveBayesClassifier.train(training_set)
# Open Pickle Classifier
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("ORIGINAL Naive Bayes Accuracy: {:.2f}%".format((nltk.classify.accuracy(classifier, testing_set))*100))
classifier.show_most_informative_features(15)
# Word Ratios (Found more in pos or neg)
MNB_classifier = SklearnClassifier(MultinomialNB()) # Wrapper converts to NLTK classifier
MNB_classifier = MNB_classifier.train(training_set)
print("MNB_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(MNB_classifier, testing_set)*100))
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier = BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(BernoulliNB_classifier, testing_set)*100))
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(solver="liblinear"))
LogisticRegression_classifier = LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(LogisticRegression_classifier, testing_set)*100))
# Stochastic Gradient Descent
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier = SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(SGDClassifier_classifier, testing_set)*100))
# Very Inaccurate (< 50%)
# SVC_classifier = SklearnClassifier(SVC(gamma="auto"))
# SVC_classifier = SVC_classifier.train(training_set)
# print("SVC_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(SVC_classifier, testing_set)*100))
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier = LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(LinearSVC_classifier, testing_set)*100))
NuSVC_classifier = SklearnClassifier(NuSVC(gamma="auto"))
NuSVC_classifier = NuSVC_classifier.train(training_set)
print("NuSVC_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(NuSVC_classifier, testing_set)*100))
voted_classifier = VoteClassifier(
    classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
    LinearSVC_classifier,
    NuSVC_classifier,
)
print("voted_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(voted_classifier, testing_set)*100))
print("testing_set[0][0] Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence: {:.5f}%".format(voted_classifier.confidence(testing_set[0][0])*100))
print("testing_set[1][0] Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence: {:.5f}%".format(voted_classifier.confidence(testing_set[1][0])*100))
print("testing_set[2][0] Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence: {:.5f}%".format(voted_classifier.confidence(testing_set[2][0])*100))
print("testing_set[3][0] Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence: {:.5f}%".format(voted_classifier.confidence(testing_set[3][0])*100))
print("testing_set[4][0] Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence: {:.5f}%".format(voted_classifier.confidence(testing_set[4][0])*100))
print("testing_set[5][0] Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence: {:.5f}%".format(voted_classifier.confidence(testing_set[5][0])*100))
Next: packaging all of this up as a reusable sentiment-analysis module that any script can import.
# File: sentiment_mod.py
import nltk
import random
# from nltk.corpus import movie_reviews
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
def load_pickle(filename):
    classifier_f = open("pickle/" + filename, "rb")
    classifier = pickle.load(classifier_f)
    classifier_f.close()
    return classifier

def save_pickle(classifier, filename):
    save_classifier = open("pickle/" + filename, "wb")
    pickle.dump(classifier, save_classifier)
    save_classifier.close()
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)  # majority vote across the wrapped classifiers

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)  # fraction that voted with the majority
        return conf
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)  # Boolean: is this feature word present in the document?
    return features
documents = load_pickle("documents.pickle")
word_features = load_pickle("word_features_5k.pickle")
# featuresets = [(find_features(rev), category) for (rev, category) in documents]
# save_pickle(featuresets, "featuresets.pickle")
featuresets = load_pickle("featuresets.pickle")
random.shuffle(featuresets)
training_set = featuresets[:10000]
testing_set = featuresets[10000:]
# Naive Bayes Algorithm: Posterior = Prior x Likelihood / Evidence
classifier = load_pickle("short_reviews.pickle")
MNB_classifier = load_pickle("MNB_classifier.pickle")
# print("MNB_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(MNB_classifier, testing_set)*100))
BernoulliNB_classifier = load_pickle("BernoulliNB_classifier.pickle")
# print("BernoulliNB_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(BernoulliNB_classifier, testing_set)*100))
LogisticRegression_classifier = load_pickle("LogisticRegression_classifier.pickle")
# print("LogisticRegression_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(LogisticRegression_classifier, testing_set)*100))
LinearSVC_classifier = load_pickle("LinearSVC_classifier.pickle")
# print("LinearSVC_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(LinearSVC_classifier, testing_set)*100))
voted_classifier = VoteClassifier(
classifier,
MNB_classifier,
BernoulliNB_classifier,
LogisticRegression_classifier,
LinearSVC_classifier,
)
print("Vote Classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(voted_classifier, testing_set)*100))
def sentiment(text):
    feats = find_features(text)
    return voted_classifier.classify(feats), voted_classifier.confidence(feats)
import sentiment_mod as s
# Positive
print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
# Negative
print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))
# tweepy 3.x streaming API (StreamListener was merged into Stream in tweepy 4)
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import sentiment_mod as s
#consumer key, consumer secret, access token, access secret
#ckey, csecret, akey, asecret
from secret import *
class listener(StreamListener):
    def on_data(self, data):
        all_data = json.loads(data)
        tweet = all_data["text"]
        sentiment_value, confidence = s.sentiment(tweet)
        print(tweet, sentiment_value, confidence)
        if confidence*100 >= 95:  # only log high-confidence classifications
            output = open("twitter-out.txt", "a")
            output.write(sentiment_value)
            output.write('\n')
            output.close()
        return True

    def on_error(self, status):
        print(status)
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
open("twitter-out.txt", "w").close() # Clear file
twitterStream = Stream(auth, listener())
word = input("Keyword(s) to track: ")
twitterStream.filter(track=[word])
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
import time
style.use("ggplot")
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
fig.canvas.manager.set_window_title("Live Twitter Sentiment")  # canvas.set_window_title was removed in newer matplotlib
def animate(i):
    pullData = open("twitter-out.txt", "r").read()
    lines = pullData.split('\n')
    xar = []
    yar = []
    x = 0
    y = 0
    for l in lines[-200:]:
        x += 1  # Move x along
        if "pos" in l:
            y += 1
        elif "neg" in l:
            y -= 1
        xar.append(x)
        yar.append(y)
    ax1.clear()
    ax1.plot(xar, yar)
ani = animation.FuncAnimation(fig, animate, interval=1000)
plt.show()
tdhc.uk/nltk-talk
Adapted from: pythonprogramming.net
Using: NLTK (nltk.org)
Photos/Gradients: Unsplash
Music: youtu.be/YPyyvmnT1ng