Timothy Clark
Adapted from: pythonprogramming.net
# Ubuntu Linux sudo apt install -y python3 python3-pip # Mac - open a terminal!
# Windows py main.py # Mac/Linux python3 main.py
def add(a, b):
    """Print the sum of two numbers as "a + b = result"."""
    print("{} + {} = {}".format(a, b, a + b))


print("Hello World!")
add(2, 6)
import nltk

# Opens the interactive downloader so the corpora/models used in the
# later examples can be fetched.
nltk.download()
# https://www.nltk.org # Windows py -m pip install nltk # Mac/Linux pip install nltk pip3 install nltk
Splitting text into sentences, and sentences into words.
These are called tokens.
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenisation demo: split text into sentences, then sentences into words.
txt = "Hello Mr. Smith, how are you doing today? The weather is great, and NLTK is awesome. The sky is kinda blue. You shouldn't eat cardboard."

sentences = sent_tokenize(txt)
print(sentences)
print()

for s in sentences:
    print(s)
    words = word_tokenize(s)
    print(words)
Stop words are essentially the ones we don't really care about - they don't change the semantic meaning of sentences
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

sentence = "Hello Mr. Smith, how are you doing today?"
sentence = word_tokenize(sentence)

# NLTK ships a curated list of English stop words.
stop_words = set(stopwords.words("english"))
print(stop_words)

# Keep only the tokens that are NOT stop words.
filtered = [w for w in sentence if w not in stop_words]
print(filtered)
When processing natural language, the semantic meaning often relates to the "stem" of the word, not the actual word itself
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
ls = LancasterStemmer()

# "walkly" is deliberately not a real word — stemmers just strip affixes.
words = ["walk", "walker", "walking", "walked", "walkly"]
for w in words:
    print(w, ps.stem(w))

# Compare against the (more aggressive) Lancaster stemmer:
# for w in words:
#     print(w, ls.stem(w))
PoS tagging involves working out which bits of a sentence you are looking at
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Train the Punkt sentence tokenizer on one speech, apply it to another.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    """Print the part-of-speech tags for every tokenised sentence."""
    try:
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process_content()
Involves extracting "chunks" of text, which consist of the parts of speech from before
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenised = custom_sent_tokenizer.tokenize(sample_text)

try:
    for sentence in tokenised:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)

        # 0+ adverbs, 0+ verbs, 1+ proper nouns, optional noun.
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        print(chunked)
        chunked.draw()
except Exception as e:
    print(str(e))
Removing something from a "chunk" that you have extracted from a body of text
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenised = custom_sent_tokenizer.tokenize(sample_text)

try:
    # Skip the first five sentences (speech preamble).
    for sentence in tokenised[5:]:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)

        # Chunk everything, then "chink" (exclude) verbs/prepositions/
        # determiners/"to" back out of the chunks.
        chunkGram = r"""Chunk: {<.*>+} }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        chunked.draw()
except Exception as e:
    print(str(e))
Recognising "entities" in the text we're analysing
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    """Draw a named-entity chunk tree for each sentence after the fifth."""
    try:
        for sentence in tokenized[5:]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            # namedEnt = nltk.ne_chunk(tagged)
            # binary=True lumps every entity type into one "NE" label.
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()
    except Exception as e:
        print(str(e))


process_content()
Let's read the Bible!
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

# Read the King James Bible from the Gutenberg corpus and show a few
# sentences.  Check out the other corpora via nltk.download().
text = gutenberg.raw("bible-kjv.txt")
tokens = sent_tokenize(text)
print(tokens[5:15])
from nltk.corpus import wordnet

syns = wordnet.synsets("program")

print(syns[0].name())              # Synset
print(syns[0].lemmas()[0].name())  # Word only
print(syns[0].definition())        # Definition
print(syns[0].examples())          # Examples

# Synonyms & antonyms of "good".
synonyms = []
antonyms = []
for syn in wordnet.synsets("good"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        if lemma.antonyms():
            antonyms.append(lemma.antonyms()[0].name())
print(set(synonyms))
print(set(antonyms))

# Wu-Palmer similarity: "ship" vs progressively less related nouns.
ship = wordnet.synset("ship.n.01")
for other in ("boat.n.01", "car.n.01", "cat.n.01", "cactus.n.01"):
    print(ship.wup_similarity(wordnet.synset(other)))
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Default part of speech is NOUN.
for word in ("cats", "cacti", "geese", "rocks", "python"):
    print(lemmatizer.lemmatize(word))
print()

# Supplying the part of speech changes the result.
print(lemmatizer.lemmatize("better"))           # treated as a noun
print(lemmatizer.lemmatize("better", pos="a"))  # adjective
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", pos="v"))     # verb
What is classification?
import nltk
import random
from nltk.corpus import movie_reviews

# Build (word-list, label) pairs for every review in the corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Frequency distribution over every (lower-cased) word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# FIX: FreqDist.keys() is insertion-ordered, NOT frequency-ordered, so
# slicing it did not give the top words as the comment claimed.
# most_common() really does return the 3,000 most frequent words.
word_features = [word for word, count in all_words.most_common(3000)]


def find_features(document):
    """Return {feature_word: bool} marking which features appear in document."""
    words = set(document)
    return {w: (w in words) for w in word_features}


# print(find_features(movie_reviews.words("neg/cv000_29416.txt")))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900]  # First 1900 reviews
testing_set = featuresets[1900:]   # Remaining reviews

# Naive Bayes: Posterior = Prior Occurrences x Likelihood / Evidence
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Accuracy: {:f}%".format((nltk.classify.accuracy(classifier, testing_set))*100))
classifier.show_most_informative_features(15)  # Word ratios (found more in pos or neg)
import nltk

# NOTE(review): `data` is assumed to be a list of (features, label)
# pairs defined earlier in the talk — confirm against the full example.
training_set = data[:1900]   # first 1,900 examples train
testing_set = data[1900:]    # the remainder test

classifier = nltk.NaiveBayesClassifier.train(training_set)
import nltk

# NOTE(review): `data` is assumed to be a list of (features, label)
# pairs defined earlier in the talk — confirm against the full example.
training_set = data[:1900]   # first 1,900 examples train
testing_set = data[1900:]    # the remainder test

classifier = nltk.NaiveBayesClassifier.train(training_set)
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier

# Build (word-list, label) pairs for every review in the corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Frequency distribution over every (lower-cased) word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# FIX: FreqDist.keys() is insertion-ordered, not frequency-ordered;
# most_common() actually yields the top 3,000 words.
word_features = [word for word, count in all_words.most_common(3000)]


def find_features(document):
    """Return {feature_word: bool} marking which features appear in document."""
    words = set(document)
    return {w: (w in words) for w in word_features}


# print(find_features(movie_reviews.words("neg/cv000_29416.txt")))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900]  # First 1900
testing_set = featuresets[1900:]   # Post 1900

# Naive Bayes: Posterior = Prior Occurrences x Likelihood / Evidence
# FIX: the original trained a NaiveBayes classifier here and then
# immediately overwrote it with the pickled one, discarding the
# (expensive) training — just load the pickle.  A context manager
# guarantees the file is closed even if pickle.load raises.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("ORIGINAL Naive Bayes Accuracy: {:.2f}%".format((nltk.classify.accuracy(classifier, testing_set))*100))
classifier.show_most_informative_features(15)  # Word ratios (found more in pos or neg)


def _train_and_score(name, estimator):
    """Wrap an sklearn estimator as an NLTK classifier, train it, print accuracy."""
    wrapped = SklearnClassifier(estimator).train(training_set)
    print("{} Accuracy: {:.2f}%".format(name, nltk.classify.accuracy(wrapped, testing_set)*100))
    return wrapped


MNB_classifier = _train_and_score("MNB_classifier", MultinomialNB())
BernoulliNB_classifier = _train_and_score("BernoulliNB_classifier", BernoulliNB())
LogisticRegression_classifier = _train_and_score("LogisticRegression_classifier", LogisticRegression(solver="liblinear"))
SGDClassifier_classifier = _train_and_score("SGDClassifier_classifier", SGDClassifier())
# Very inaccurate (< 50%)
SVC_classifier = _train_and_score("SVC_classifier", SVC(gamma="auto"))
LinearSVC_classifier = _train_and_score("LinearSVC_classifier", LinearSVC())
NuSVC_classifier = _train_and_score("NuSVC_classifier", NuSVC(gamma="auto"))
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from statistics import mode


class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several trained NLTK classifiers."""

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def _votes(self, features):
        # One vote (label) per underlying classifier.
        return [c.classify(features) for c in self._classifiers]

    def classify(self, features):
        """Return the label most of the underlying classifiers chose."""
        return mode(self._votes(features))

    def confidence(self, features):
        """Return the fraction of classifiers agreeing with the winning label."""
        votes = self._votes(features)
        return votes.count(mode(votes)) / len(votes)


# Build (word-list, label) pairs for every review in the corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Frequency distribution over every (lower-cased) word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# FIX: FreqDist.keys() is insertion-ordered, not frequency-ordered;
# most_common() actually yields the top 3,000 words.
word_features = [word for word, count in all_words.most_common(3000)]


def find_features(document):
    """Return {feature_word: bool} marking which features appear in document."""
    words = set(document)
    return {w: (w in words) for w in word_features}


featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900]  # First 1900
testing_set = featuresets[1900:]   # Post 1900

# Naive Bayes: Posterior = Prior Occurrences x Likelihood / Evidence
# FIX: the original trained a NaiveBayes classifier and then immediately
# overwrote it with the pickled one, discarding the training — load only,
# and close the file deterministically with a context manager.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("ORIGINAL Naive Bayes Accuracy: {:.2f}%".format((nltk.classify.accuracy(classifier, testing_set))*100))
classifier.show_most_informative_features(15)  # Word ratios (found more in pos or neg)


def _train_and_score(name, estimator):
    """Wrap an sklearn estimator as an NLTK classifier, train it, print accuracy."""
    wrapped = SklearnClassifier(estimator).train(training_set)
    print("{} Accuracy: {:.2f}%".format(name, nltk.classify.accuracy(wrapped, testing_set)*100))
    return wrapped


MNB_classifier = _train_and_score("MNB_classifier", MultinomialNB())
BernoulliNB_classifier = _train_and_score("BernoulliNB_classifier", BernoulliNB())
LogisticRegression_classifier = _train_and_score("LogisticRegression_classifier", LogisticRegression(solver="liblinear"))
# Stochastic Gradient Descent
SGDClassifier_classifier = _train_and_score("SGDClassifier_classifier", SGDClassifier())

# Very inaccurate (< 50%)
# SVC_classifier = _train_and_score("SVC_classifier", SVC(gamma="auto"))

LinearSVC_classifier = _train_and_score("LinearSVC_classifier", LinearSVC())
NuSVC_classifier = _train_and_score("NuSVC_classifier", NuSVC(gamma="auto"))

voted_classifier = VoteClassifier(classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier,
                                  SGDClassifier_classifier,
                                  LinearSVC_classifier,
                                  NuSVC_classifier)
print("voted_classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(voted_classifier, testing_set)*100))

# Spot-check the first six test examples with the ensemble's confidence.
for i in range(6):
    feats = testing_set[i][0]
    print("testing_set[{}][0] Classification:".format(i),
          voted_classifier.classify(feats),
          "Confidence: {:.5f}%".format(voted_classifier.confidence(feats)*100))
By pickling the trained classifiers and feature words, we can package everything up as a reusable sentiment-analysis module.
# File: sentiment_mod.py
import nltk
import random
# from nltk.corpus import movie_reviews
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize


def load_pickle(filename):
    """Unpickle and return an object from the pickle/ directory."""
    # FIX: context manager closes the handle even if pickle.load raises.
    with open("pickle/" + filename, "rb") as f:
        return pickle.load(f)


def save_pickle(classifier, filename):
    """Pickle `classifier` into the pickle/ directory."""
    with open("pickle/" + filename, "wb") as f:
        pickle.dump(classifier, f)


class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several trained NLTK classifiers."""

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label most of the underlying classifiers chose."""
        votes = [c.classify(features) for c in self._classifiers]
        return mode(votes)

    def confidence(self, features):
        """Return the fraction of classifiers agreeing with the winning label."""
        votes = [c.classify(features) for c in self._classifiers]
        return votes.count(mode(votes)) / len(votes)


def find_features(document):
    """Return {feature_word: bool} for the pickled 5k feature words."""
    words = word_tokenize(document)
    return {w: (w in words) for w in word_features}


documents = load_pickle("documents.pickle")
word_features = load_pickle("word_features_5k.pickle")

# Regenerate the feature sets only when the pickle is stale:
# featuresets = [(find_features(rev), category) for (rev, category) in documents]
# save_pickle(featuresets, "featuresets.pickle")
featuresets = load_pickle("featuresets.pickle")
random.shuffle(featuresets)

training_set = featuresets[:10000]
testing_set = featuresets[10000:]

# Naive Bayes Algorithm: Posterior = Prior Occurrences x Likelihood / Evidence
classifier = load_pickle("short_reviews.pickle")
MNB_classifier = load_pickle("MNB_classifier.pickle")
BernoulliNB_classifier = load_pickle("BernoulliNB_classifier.pickle")
LogisticRegression_classifier = load_pickle("LogisticRegression_classifier.pickle")
LinearSVC_classifier = load_pickle("LinearSVC_classifier.pickle")

voted_classifier = VoteClassifier(
    classifier,
    MNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
    LinearSVC_classifier,
)
print("Vote Classifier Accuracy: {:.2f}%".format(nltk.classify.accuracy(voted_classifier, testing_set)*100))


def sentiment(text):
    """Classify `text`; return (label, confidence) from the voting ensemble."""
    feats = find_features(text)
    return voted_classifier.classify(feats), voted_classifier.confidence(feats)
import sentiment_mod as s

# Positive example
print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))

# Negative example
print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import sentiment_mod as s

# consumer key, consumer secret, access token, access secret
# ckey, csecret, akey, asecret
from secret import *


class listener(StreamListener):
    """Streams tweets for a keyword and logs high-confidence sentiments."""

    def on_data(self, data):
        all_data = json.loads(data)
        tweet = all_data["text"]
        sentiment_value, confidence = s.sentiment(tweet)
        print(tweet, sentiment_value, confidence)

        # Only record classifications the ensemble is >= 95% confident in.
        if confidence*100 >= 95:
            # FIX: context manager flushes/closes the file even if a
            # write raises mid-stream.
            with open("twitter-out.txt", "a") as output:
                output.write(sentiment_value)
                output.write('\n')

        return True

    def on_error(self, status):
        print(status)


auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

open("twitter-out.txt", "w").close()  # Clear file

twitterStream = Stream(auth, listener())
word = input("Keyword(s) to track: ")
twitterStream.filter(track=[word])
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
import time

style.use("ggplot")

fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
# FIX: FigureCanvas.set_window_title was deprecated in Matplotlib 3.4 and
# removed in 3.6 — the window title now lives on the canvas manager.
fig.canvas.manager.set_window_title("Live Twitter Sentiment")


def animate(i):
    """Redraw the running sentiment tally from twitter-out.txt."""
    # FIX: context manager closes the file handle each frame instead of
    # leaking one per animation tick.
    with open("twitter-out.txt", "r") as f:
        lines = f.read().split('\n')

    xar = []
    yar = []
    x = 0
    y = 0
    # Only the most recent 200 classifications are plotted.
    for l in lines[-200:]:
        x += 1  # Move x along
        if "pos" in l:
            y += 1
        elif "neg" in l:
            y -= 1
        xar.append(x)
        yar.append(y)

    ax1.clear()
    ax1.plot(xar, yar)


ani = animation.FuncAnimation(fig, animate, interval=1000)
plt.show()
tdhc.uk/nltk-talk
Adapted from: pythonprogramming.net
Using: NLTK (nltk.org)
Photos/Gradients: Unsplash
Music: youtu.be/YPyyvmnT1ng