Importing necessary libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
Uploading text file
# Colab helper that opens a browser file picker for uploads.
from google.colab import files

# `uploaded` is indexed by file name; the length of each entry is reported
# below as the file size in bytes.
uploaded = files.upload()
for fn, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content)))
Reading the text file uploaded above into a DataFrame and naming its columns ID, Reviews and Sentiment
# Load the tab-separated file; column names are supplied here (review text
# first, sentiment label second).
# NOTE(review): pandas' default quote handling can merge lines when the raw
# text contains unbalanced double quotes. If the row count looks low, confirm
# against the file and consider quoting=csv.QUOTE_NONE.
data = pd.read_csv(
    '/content/imdb_labelled.txt',
    sep='\t',
    names=['Reviews', 'Sentiment'],
)
# Prepend a running row number (0 .. len(data) - 1) as an explicit ID column.
data.insert(0, 'ID', range(len(data)))
data
ID | Reviews | Sentiment | |
---|---|---|---|
0 | 0 | A very, very, very slow-moving, aimless movie ... | 0 |
1 | 1 | Not sure who was more lost - the flat characte... | 0 |
2 | 2 | Attempting artiness with black & white and cle... | 0 |
3 | 3 | Very little music or anything to speak of. | 0 |
4 | 4 | The best scene in the movie was when Gerardo i... | 1 |
... | ... | ... | ... |
743 | 743 | I just got bored watching Jessice Lange take h... | 0 |
744 | 744 | Unfortunately, any virtue in this film's produ... | 0 |
745 | 745 | In a word, it is embarrassing. | 0 |
746 | 746 | Exceptionally bad! | 0 |
747 | 747 | All in all its an insult to one's intelligence... | 0 |
748 rows × 3 columns
Removing all the special characters
# Replace every character that is not an ASCII letter or digit with a single
# space. The regex replacement is applied across the whole frame.
data = data.replace(regex=True, to_replace=r"[^a-zA-Z0-9]", value=" ")
data
ID | Reviews | Sentiment | |
---|---|---|---|
0 | 0 | A very very very slow moving aimless movie ... | 0 |
1 | 1 | Not sure who was more lost the flat characte... | 0 |
2 | 2 | Attempting artiness with black white and cle... | 0 |
3 | 3 | Very little music or anything to speak of | 0 |
4 | 4 | The best scene in the movie was when Gerardo i... | 1 |
... | ... | ... | ... |
743 | 743 | I just got bored watching Jessice Lange take h... | 0 |
744 | 744 | Unfortunately any virtue in this film s produ... | 0 |
745 | 745 | In a word it is embarrassing | 0 |
746 | 746 | Exceptionally bad | 0 |
747 | 747 | All in all its an insult to one s intelligence... | 0 |
748 rows × 3 columns
Plotting the number of Positive and Negative reviews using seaborn
# Bar chart of how many reviews carry each sentiment label.
plt.figure(figsize=(8, 8))
# The column is passed through the `x` keyword: seaborn warns that passing it
# positionally is deprecated and, from version 0.12, results in an error or
# misinterpretation.
sns.countplot(x=data['Sentiment'])
plt.suptitle("Sentiment(0=Negative,1=Positive)")
plt.ylabel("count")
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
Splitting the data into Train, Test and Development sets. With the 60% / 30% / 10% split below we get 448 Train, 225 Test and 75 Development reviews
# Cut points: the first 60% of the rows are for training, the next 10%
# (rows s1 .. s2) for development, and the remaining 30% for testing.
n_rows = len(data)
s1 = int(0.6 * n_rows)
s2 = int(0.7 * n_rows)

# Rows are taken in file order; no shuffling is performed here.
train_data = data.iloc[:s1]
dev_data = data.iloc[s1:s2]
test_data = data.iloc[s2:]

print("Size of Train data: ", len(train_data))
print("Size of Test data: ", len(test_data))
print("Size of Development data: ", len(dev_data))
Size of Train data: 448 Size of Test data: 225 Size of Development data: 75
Plotting the Negative and Positive sentiment counts for the Train, Test and Development sets
# One count plot per split, side by side.
fig, ax = plt.subplots(1, 3, figsize=(15, 8))
panels = (
    ("Train_data", train_data),
    ("Test_data", test_data),
    ("Dev_data", dev_data),
)
for axis, (label, subset) in zip(ax, panels):
    # `x=` keyword: positional use is deprecated by seaborn (error from 0.12).
    sns.countplot(x=subset['Sentiment'], ax=axis)
    # Title each subplot on its own axes. plt.suptitle sets a single
    # figure-wide title, so repeated calls would just overwrite one another.
    axis.set_title(label + " (0=Negative, 1=Positive)")
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning /usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
Text(0.5, 0.98, 'Train_data[(0=Negative , 1=Positive')
Now we need to create the vocabulary: for every word, the number of training reviews that contain it
# Vocabulary over the training set: lower-cased word -> number of distinct
# training reviews in which the word occurs.
wordlist = {}
for review_no, review in enumerate(train_data.Reviews, start=1):
    for token in review.split():
        # A set keeps each review number once, however often the word
        # repeats inside a single review.
        wordlist.setdefault(token.lower(), set()).add(review_no)

# Collapse each set of review numbers into its size.
wordlist = {token: len(review_ids) for token, review_ids in wordlist.items()}
wordlist
{'a': 158, 'very': 31, 'slow': 3, 'moving': 2, 'aimless': 1, 'movie': 85, 'about': 21, 'distressed': 1, 'drifting': 1, 'young': 3, 'man': 8, 'not': 33, 'sure': 3, 'who': 21, 'was': 95, 'more': 21, 'lost': 3, 'the': 220, 'flat': 2, 'characters': 19, 'or': 21, 'audience': 4, 'nearly': 1, 'half': 6, 'of': 131, 'whom': 2, 'walked': 1, 'out': 27, 'attempting': 1, 'artiness': 1, 'with': 47, 'black': 7, 'white': 4, 'and': 166, 'clever': 3, 'camera': 8, 'angles': 2, 'disappointed': 5, 'became': 1, 'even': 26, 'ridiculous': 4, 'as': 40, 'acting': 21, 'poor': 5, 'plot': 18, 'lines': 5, 'almost': 8, 'non': 4, 'existent': 1, 'little': 13, 'music': 6, 'anything': 5, 'to': 107, 'speak': 3, 'best': 15, 'scene': 5, 'in': 81, 'when': 14, 'gerardo': 1, 'is': 133, 'trying': 3, 'find': 6, 'song': 2, 'that': 77, 'keeps': 1, 'running': 1, 'through': 8, 'his': 18, 'head': 3, 'rest': 4, 'lacks': 4, 'art': 7, 'charm': 2, 'meaning': 2, 'if': 21, 'it': 125, 's': 58, 'emptiness': 1, 'works': 4, 'i': 128, 'guess': 4, 'because': 17, 'empty': 1, 'wasted': 5, 'two': 4, 'hours': 3, 'saw': 7, 'today': 4, 'thought': 9, 'good': 24, 'effort': 1, 'messages': 1, 'for': 53, 'kids': 6, 'bit': 3, 'predictable': 6, 'loved': 5, 'casting': 5, 'jimmy': 2, 'buffet': 1, 'science': 1, 'teacher': 1, 'those': 8, 'baby': 1, 'owls': 1, 'were': 24, 'adorable': 4, 'showed': 2, 'lot': 4, 'florida': 1, 'at': 29, 'made': 14, 'look': 11, 'appealing': 2, 'songs': 4, 'muppets': 1, 'so': 37, 'hilarious': 4, 'cool': 5, 'this': 127, 'right': 5, 'on': 30, 'case': 1, 'delivers': 3, 'everything': 6, 'your': 10, 'face': 3, 'had': 14, 'some': 16, 'average': 3, 'from': 21, 'main': 1, 'person': 1, 'low': 5, 'budget': 5, 'you': 30, 'clearly': 2, 'can': 14, 'see': 24, 'review': 2, 'long': 5, 'overdue': 1, 'since': 3, 'consider': 3, 'tale': 1, 'sisters': 1, 'be': 32, 'single': 4, 'greatest': 3, 'film': 66, 'ever': 13, 'll': 6, 'put': 3, 'gem': 1, 'up': 13, 'against': 1, 'any': 10, 'terms': 2, 'screenplay': 1, 'cinematography': 6, 'post': 
1, 'production': 2, 'editing': 3, 'directing': 6, 'other': 14, 'aspect': 1, 'making': 4, 'practically': 1, 'perfect': 4, 'all': 30, 'them': 12, 'true': 3, 'masterpiece': 2, 'sea': 1, 'faux': 2, 'masterpieces': 1, 'structure': 1, 'easily': 2, 'most': 15, 'tightly': 1, 'constructed': 1, 'history': 5, 'cinema': 5, '1': 8, 'think': 12, 'no': 20, 'where': 7, 'something': 4, 'vitally': 1, 'important': 2, 'occurs': 1, 'every': 10, 'minute': 3, 'words': 2, 'content': 1, 'level': 3, 'enough': 8, 'fill': 1, 'dozen': 1, 'films': 9, 'how': 14, 'anyone': 12, 'their': 6, 'mind': 2, 'ask': 2, 'than': 16, 'quite': 7, 'simply': 6, 'highest': 1, 'superlative': 1, 'form': 1, 'imaginable': 2, 'yes': 3, 'does': 5, 'require': 1, 'rather': 3, 'significant': 2, 'amount': 3, 'puzzle': 1, 'solving': 1, 'but': 51, 'pieces': 2, 'fit': 1, 'together': 5, 'create': 1, 'beautiful': 8, 'picture': 3, 'short': 5, 'certainly': 6, 'pulls': 1, 'punches': 1, '0': 6, 'graphics': 1, 'far': 4, 'part': 7, 'game': 1, 'number': 1, 'one': 21, 'th': 1, 'series': 3, 'deserves': 1, 'strong': 2, 'love': 12, 'an': 25, 'insane': 1, 'there': 34, 'are': 38, 'massive': 1, 'levels': 2, 'unlockable': 1, 'just': 27, 'waste': 7, 'money': 2, 'kind': 6, 'properly': 1, 'actually': 6, 'time': 18, 'crap': 4, 'they': 15, 'say': 9, 'canada': 2, 'fun': 3, 'aye': 1, 'rocks': 1, 'buy': 2, 'play': 5, 'enjoy': 3, 'pure': 3, 'brilliance': 1, 'flick': 4, 'doomed': 1, 'its': 9, 'conception': 1, 'idea': 4, 'lame': 3, 'take': 4, 'minor': 1, 'character': 10, 'mediocre': 2, 'pg': 2, '13': 2, 'make': 14, 'complete': 1, 'sequel': 2, 'while': 3, 'changing': 1, 'tone': 1, 'rated': 2, 'family': 5, 'wasn': 6, 't': 44, 'least': 3, 'interested': 2, 'only': 19, 'did': 8, 'confirm': 1, 'would': 17, 'unfunny': 2, 'generic': 1, 'also': 14, 'managed': 1, 'give': 7, 'away': 4, 'entire': 3, 'm': 4, 'exaggerating': 1, 'moment': 3, 'point': 2, 'joke': 2, 'told': 3, 'trailer': 1, 'funny': 13, 'talented': 3, 'carrell': 1, 'save': 1, 'co': 1, 'stars': 1, 'don': 
13, 'fare': 1, 'much': 15, 'better': 7, 'people': 6, 'like': 20, 'morgan': 1, 'freeman': 1, 'jonah': 2, 'hill': 2, 'ed': 2, 'helms': 1, 'story': 9, 'itself': 2, 'lazy': 1, 'real': 12, 'effects': 4, 'work': 14, 'presence': 2, 'animals': 1, 'integration': 1, 'into': 10, 'scenes': 8, 'worst': 4, 'obvious': 1, 'blue': 1, 'green': 1, 'screen': 6, 've': 7, 'seen': 10, 'whatever': 5, 'cost': 2, 'didn': 11, 'translate': 1, 'quality': 2, 'succeeds': 1, 'despite': 1, 'perhaps': 3, 'obviously': 3, 'meagre': 1, 'glad': 2, 'go': 8, 'choice': 1, 'lesser': 1, 'have': 22, 'addition': 2, 'having': 5, 'lovely': 2, 'written': 5, 'french': 2, 'cancan': 1, 'boasts': 1, 'cutest': 1, 'leading': 1, 'ladies': 1, 'grace': 2, 'hard': 3, 'fall': 1, 'over': 10, 'heels': 1, 'girl': 5, 'negative': 3, 'insipid': 1, 'cause': 2, 'regret': 1, 'another': 4, '2': 1, 'life': 7, 'front': 2, 'whiny': 1, 'pointless': 1, 'recommend': 7, 'waiting': 1, 'future': 1, 'efforts': 1, 'let': 5, 'excellent': 6, 'cast': 14, 'line': 3, 'performances': 3, 'totally': 8, 'believable': 2, 'anne': 1, 'heche': 1, 'utterly': 3, 'convincing': 3, 'sam': 1, 'shepard': 1, 'portrayal': 3, 'gung': 1, 'ho': 1, 'marine': 1, 'sobering': 1, 'sat': 1, 'riveted': 1, 'tv': 2, 'resounding': 1, '9': 1, '10': 16, 'do': 12, 'tom': 1, 'hanks': 1, 'actor': 5, 'enjoyed': 6, 'reading': 1, 'book': 1, 'my': 23, 'children': 2, 'annoying': 6, 'voice': 2, 'gives': 2, 'me': 12, 'feeling': 6, 'fingernails': 1, 'chalkboard': 1, 'unnecessary': 1, 'train': 1, 'roller': 1, 'coaster': 1, 'absolutely': 7, 'warmth': 1, 'these': 6, 'grates': 1, 'nerves': 1, 'improved': 1, 'by': 24, 'improvisation': 1, 'actors': 11, 'now': 7, 'twice': 2, 'worry': 1, 'whether': 3, 're': 3, 'delivering': 1, 'well': 20, 'honestly': 1, 'often': 4, 'dialogue': 6, 'doesn': 8, 'really': 26, 'follow': 1, 'surroundings': 1, 'crackles': 1, 'unpredictable': 2, 'youthful': 1, 'energy': 4, 'found': 6, 'concentrate': 1, 'meanders': 1, 'badly': 2, 'generally': 3, 'great': 18, 'things': 11, 
'wouldn': 3, 'worth': 8, 'though': 9, 'suspense': 5, 'builders': 1, 'cross': 1, 'g': 2, 'especially': 5, 'liked': 4, 'cliche': 1, 'choices': 1, 'parents': 2, 'movies': 10, 'could': 16, 'predict': 1, 'dialog': 2, 'verbatim': 1, 'writing': 11, 'selections': 1, 'want': 4, 'gross': 1, 'chills': 1, 'alexander': 1, 'nevsky': 1, 'he': 17, 'amazing': 3, 'artist': 2, 'whoever': 1, 'lived': 1, 'pretentious': 3, 'piece': 6, 'planned': 1, 'dodge': 1, 'stratus': 1, 'big': 6, 'shots': 3, 'gonna': 1, 'help': 1, 'makers': 1, 'aren': 2, 'restrained': 1, 'business': 1, 'qu': 1, 'bec': 1, 'first': 7, 'given': 3, 'years': 7, 'needed': 2, 'word': 4, 'mouth': 1, 'promote': 1, 'overall': 5, 'interesting': 6, 'provoking': 1, 'plus': 2, 'paced': 1, 'suited': 1, 'relatively': 1, 'run': 2, 'gave': 4, 'wind': 3, 'lion': 3, 'superbly': 2, 'acted': 2, 'classic': 5, 'turned': 2, 'pretty': 8, 'decent': 3, 'b': 2, 'list': 2, 'horror': 5, 'definitely': 6, 'checking': 3, 'problem': 2, 'script': 14, 'horrendous': 1, 'nothing': 7, 'frustration': 1, 'retarded': 1, 'girls': 1, 'manna': 1, 'heaven': 1, 'terrific': 3, 'both': 7, 'same': 3, 'occasionally': 4, 'touching': 2, 'evaluate': 1, 'lives': 1, 'going': 8, 'veteran': 1, 'nostalgia': 1, 'trip': 1, 'ursula': 1, 'burton': 1, 'nun': 1, 'nuns': 1, 'church': 2, 'looking': 1, 'including': 3, 'shirley': 1, 'jones': 1, 'rendition': 1, 'way': 14, 'tonight': 1, 'uplifting': 2, 'ending': 8, 'try': 2, 'will': 11, 'frankly': 2, 'after': 9, 'cotton': 1, 'club': 1, 'unfaithful': 1, 'embarrassing': 2, 'watch': 11, 'lane': 2, 'gere': 1, 'bad': 33, 'dialogs': 1, 'extremely': 2, 'shallow': 1, 'insincere': 1, 'too': 17, 'chick': 2, 'politically': 1, 'correct': 1, 'disappointing': 2, 'thing': 8, 'watching': 12, 'scenery': 1, 'house': 3, 'never': 8, 'her': 12, 'lousy': 2, 'hour': 3, 'wish': 2, 'bring': 2, 'back': 3, 'fresh': 1, 'bold': 2, 'helps': 2, 'along': 2, 'maybe': 3, 'idiot': 2, 'savant': 1, 'sister': 1, 'been': 8, 'played': 6, 'joy': 4, 'boring': 6, 'sometimes': 3, 
'myself': 5, 'occupied': 2, 'peaking': 1, 'paper': 1, 'instead': 1, 'happened': 2, 'during': 3, 'columbo': 1, 'before': 6, 'seems': 1, 'oh': 4, 'mature': 2, 'neighbour': 1, 'misplace': 1, 'weaker': 1, 'episode': 3, 'then': 3, 'debated': 1, 'sack': 1, 'trumpeter': 1, 'falsely': 1, 'accused': 1, 'murder': 1, 'stupid': 12, 'applause': 1, 'should': 5, 'prelude': 1, 'however': 5, 'director': 5, 'edge': 2, 'seat': 1, 'somewhat': 1, 'afraid': 1, 'car': 1, 'end': 4, 'night': 3, 'nice': 5, 'd': 6, 'advise': 1, 'brilliant': 5, 'playing': 5, 'villain': 1, 'rent': 2, 'michael': 1, 'ironside': 1, 'rocked': 1, 'world': 5, 'must': 5, 'social': 1, 'physical': 1, 'outlets': 1, 'fact': 2, 'tremendously': 2, 'chemistry': 2, 'between': 6, 'ben': 1, 'affleck': 1, 'sandra': 1, 'bullock': 1, 'couldn': 4, 'understand': 5, 'why': 5, 'leaving': 1, 'wife': 1, 'supposedly': 1, 'knocked': 1, 'several': 2, 'moments': 2, 'need': 1, 'excruciatingly': 1, 'remake': 3, 'friends': 1, 'wedding': 1, 'disappointment': 2, 'cannot': 2, 'believe': 7, 'agreed': 1, 'stand': 1, 'fear': 1, 'losing': 2, 'q': 1, 'nobody': 3, 'network': 1, 'aired': 1, 'dribble': 1, 'watched': 6, 'putting': 2, 'imdb': 1, 'ratings': 2, 'awful': 10, 'get': 11, 'numbers': 1, 'cases': 1, 'such': 9, 'mirrormask': 1, 'last': 2, 'unsatisfactory': 1, 'experience': 4, 'unfortunately': 3, 'inexperience': 1, 'direction': 3, 'meant': 1, 'passed': 1, 'dramatic': 2, 'tension': 1, 'conflict': 1, 'central': 1, 'themes': 2, 'handled': 2, 'ineptly': 1, 'stereotypically': 1, 'depth': 4, 'imagination': 4, 'pictures': 1, 'flawed': 2, 'core': 2, 'following': 2, 'bunch': 2, 'high': 2, 'schoolers': 1, 'whine': 1, 'cry': 1, 'relate': 1, 'hell': 1, 'barely': 4, 'cult': 3, 'viewing': 1, 'sharing': 1, 'others': 1, 'disaster': 2, 'confuses': 1, 'incredibly': 2, 'fish': 1, 'underwater': 1, 'repeated': 1, 'thousand': 1, 'times': 3, 'truly': 4, 'terrible': 9, 'worse': 9, 'possible': 2, 'redeemed': 1, 'mst3k': 1, 'fodder': 1, 'paid': 2, 'treat': 1, 'anthony': 1, 
'quinn': 1, 'crazy': 2, 'horse': 1, 'whatsoever': 3, 'again': 7, 'horrible': 3, 'gosh': 1, 'walk': 1, 'theatre': 1, 'few': 4, 'minutes': 7, 'relief': 1, 'hate': 2, 'yeah': 2, 'sucked': 3, 'storyline': 4, 'pillow': 1, 'girlfriend': 2, 'boyfriend': 1, 'keep': 2, 'gone': 1, 'disliked': 2, 'tickets': 1, 'five': 1, 'dollars': 1, 'mad': 1, '7': 1, '50': 1, 'identifies': 1, 'cardboard': 2, 'cutouts': 1, 'stereotypes': 2, 'predictably': 1, 'reverse': 1, 'ugly': 1, 'cartoon': 2, 'crafted': 2, 'paul': 2, 'haggis': 1, 'handle': 1, 'strokes': 1, 'storytelling': 1, 'painted': 1, 'crayons': 1, 'crash': 1, 'depressing': 2, 'provokes': 1, 'emotion': 1, 'teaches': 1, 'already': 2, 'know': 10, 'racism': 1, 'prejudice': 1, 'still': 4, 'empowerment': 1, 'women': 3, 'brain': 1, 'attempt': 2, 'shut': 1, 'down': 2, 'primal': 1, 'impulse': 1, 'self': 4, 'preservation': 1, 'left': 2, 'shattered': 1, 'took': 2, 'fully': 1, 'recover': 1, 'joins': 1, 'revenge': 1, 'boogeyman': 1, 'zombiez': 1, 'hellish': 1, 'trinity': 1, 'mean': 4, 'distinction': 1, 'has': 18, 'redeeming': 2, 'features': 3, 'appalling': 2, 'artless': 1, 'endlessly': 1, 'presents': 1, 'us': 2, 'ugliest': 1, 'setting': 2, 'e': 1, 'beyond': 3, 'lead': 5, 'charisma': 2, 'free': 3, 'without': 2, 'merit': 1, 'akin': 1, 'torture': 4, 'reasonable': 1, 'explanation': 2, 'atrocity': 1, 'pleasant': 1, 'voyage': 1, 'discovery': 1, 'highly': 5, 'unrecommended': 1, 'premise': 4, 'sound': 4, 'performance': 5, 'ms': 1, 'garbo': 1, 'off': 7, 'bat': 1, 'talents': 1, 'carry': 1, 'silent': 3, 'era': 1, 'wanted': 2, 'netflix': 1, 'seem': 3, 'stocking': 1, 'renowned': 1, 'screenwriter': 2, 'frances': 1, 'marion': 1, 'hasn': 1, 'missed': 2, 'step': 2, 'suffered': 1, 'many': 8, 'close': 3, 'ups': 3, 'seemed': 1, 'drag': 1, 'heroes': 1, 'freedom': 2, 'below': 2, 'ones': 1, 'received': 1, 'age': 2, 'john': 4, 'wayne': 1, 'incredible': 2, 'job': 5, 'being': 5, 'industry': 1, 'shined': 1, 'senior': 1, 'him': 5, 'older': 1, 'indulgent': 2, 'junk': 1, 
'probably': 6, 'hadn': 1, 'spent': 3, 'showcasing': 1, 'own': 4, 'which': 11, 'isn': 2, 'noteworthy': 2, 'got': 3, 'punched': 1, 'gallon': 1, 'blood': 1, 'spew': 1, 'forth': 1, 'soon': 1, 'jamie': 1, 'foxx': 1, 'ray': 2, 'charles': 2, 'genius': 2, 'owns': 1, 'spacek': 1, 'owned': 1, 'coal': 1, 'miner': 1, 'daughter': 2, 'quaid': 1, 'balls': 1, 'fire': 1, 'remember': 3, 'himself': 1, 'legendary': 1, 'provided': 1, 'biographical': 1, 'material': 2, 'goes': 2, 'musician': 1, 'hitchcock': 3, 'ironically': 1, 'mostly': 3, 'total': 1, 'secondly': 1, 'perfected': 1, 'thriller': 2, 'chase': 1, 'pandering': 1, 'sabotages': 1, 'hence': 1, 'whole': 3, 'certain': 1, 'rumbles': 1, 'machine': 2, 'desperately': 1, 'depending': 1, 'new': 3, 'usual': 3, 'logic': 1, 'flaws': 1, 'mishima': 1, 'uninteresting': 1, 'chilly': 1, 'unremarkable': 1, 'author': 1, 'living': 3, 'working': 1, 'abstruse': 1, 'culture': 1, 'reenactments': 1, 'hold': 1, 'attention': 1, 'emotionally': 1, 'adrift': 1, 'stagy': 1, 'sits': 1, 'soldiers': 1, 'singing': 3, 'masculinity': 1, 'pledge': 1, 'themselves': 3, 'hairsplitting': 1, 'purity': 1, ...}
Function to omit words whose count is 5 or less
def pop_words(list1, threshold=5):
    """Remove rare words from a word-count dictionary, in place.

    Args:
        list1: Dictionary mapping each word to its count. Despite the
            parameter name this is a dict, not a list; it is mutated.
        threshold: Words whose count is less than or equal to this value
            are removed. Defaults to 5.

    Returns:
        None. The dictionary passed in is modified directly.
    """
    # Collect the keys first so the dict is not resized while iterating it.
    rare_words = [word for word, count in list1.items() if count <= threshold]
    for word in rare_words:
        del list1[word]
Now, counting and displaying negative and positive words,
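a) Counting Negative words (reviews with Sentiment = 0). Words found in 5 or fewer reviews can be removed with pop_words, but that call is commented out below, so the full counts are displayed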
# Lower-cased word -> number of distinct negative (Sentiment == 0) training
# reviews that contain it.
negative_words = {}
labelled_reviews = zip(train_data.Reviews, train_data.Sentiment)
for review_no, (review, label) in enumerate(labelled_reviews, start=1):
    if label != 0:
        # Not a negative review: it still consumes a review number.
        continue
    for token in review.split():
        negative_words.setdefault(token.lower(), set()).add(review_no)

# Collapse each set of review numbers into its size.
negative_words = {
    token: len(review_ids) for token, review_ids in negative_words.items()
}
# Pruning of rare words is left disabled, so every word is kept.
#pop_words(negative_words)
negative_words
{'a': 80, 'very': 20, 'slow': 3, 'moving': 2, 'aimless': 1, 'movie': 50, 'about': 14, 'distressed': 1, 'drifting': 1, 'young': 2, 'man': 4, 'not': 28, 'sure': 3, 'who': 12, 'was': 59, 'more': 9, 'lost': 1, 'the': 127, 'flat': 2, 'characters': 10, 'or': 16, 'audience': 2, 'nearly': 1, 'half': 5, 'of': 71, 'whom': 2, 'walked': 1, 'out': 12, 'attempting': 1, 'artiness': 1, 'with': 28, 'black': 4, 'white': 1, 'and': 78, 'clever': 2, 'camera': 4, 'angles': 1, 'disappointed': 4, 'became': 1, 'even': 22, 'ridiculous': 4, 'as': 22, 'acting': 16, 'poor': 5, 'plot': 16, 'lines': 5, 'almost': 6, 'non': 3, 'existent': 1, 'little': 11, 'music': 3, 'anything': 5, 'to': 59, 'speak': 2, 'rest': 3, 'lacks': 3, 'art': 2, 'charm': 2, 'meaning': 2, 'if': 13, 'it': 68, 's': 28, 'emptiness': 1, 'works': 2, 'i': 65, 'guess': 4, 'because': 12, 'empty': 1, 'wasted': 5, 'two': 3, 'hours': 3, 'bit': 3, 'predictable': 5, 'had': 7, 'some': 10, 'average': 3, 'from': 11, 'main': 1, 'person': 1, 'low': 4, 'budget': 3, 'you': 21, 'clearly': 2, 'can': 10, 'see': 12, 'structure': 1, 'this': 75, 'film': 33, 'is': 84, 'easily': 1, 'most': 9, 'tightly': 1, 'constructed': 1, 'in': 44, 'history': 2, 'cinema': 2, '1': 6, 'think': 5, 'no': 19, 'other': 6, 'where': 4, 'something': 2, 'vitally': 1, 'important': 1, 'occurs': 1, 'every': 4, 'minute': 2, 'words': 2, 'content': 1, 'level': 2, 'enough': 6, 'fill': 1, 'dozen': 1, 'films': 6, 'how': 10, 'anyone': 4, 'their': 3, 'right': 2, 'mind': 2, 'ask': 1, 'for': 34, 'than': 11, 'quite': 5, 'simply': 5, 'highest': 1, 'superlative': 1, 'form': 1, 'imaginable': 2, 'yes': 3, 'does': 3, 'require': 1, 'rather': 2, 'significant': 2, 'amount': 2, 'puzzle': 1, 'solving': 1, 'but': 30, 'pieces': 2, 'fit': 1, 'together': 4, 'create': 1, 'beautiful': 2, 'picture': 3, 'short': 3, 'certainly': 5, 'pulls': 1, 'punches': 1, '0': 5, 'graphics': 1, 'far': 3, 'best': 5, 'part': 5, 'game': 1, 'number': 1, 'one': 10, 'th': 1, 'series': 2, 'deserves': 1, 'strong': 1, 'love': 5, 
'an': 12, 'insane': 1, 'there': 26, 'are': 19, 'massive': 1, 'levels': 2, 'unlockable': 1, 'just': 20, 'waste': 7, 'your': 7, 'money': 2, 'on': 20, 'kind': 5, 'that': 46, 'properly': 1, 'actually': 1, 'were': 14, 'good': 10, 'at': 19, 'time': 11, 'today': 1, 'crap': 4, 'they': 10, 'say': 4, 'canada': 1, 'fun': 1, 'aye': 1, 'rocks': 1, 'buy': 1, 'play': 2, 'enjoy': 2, 'pure': 3, 'brilliance': 1, 'flick': 3, 'doomed': 1, 'its': 2, 'conception': 1, 'idea': 4, 'lame': 3, 'take': 3, 'minor': 1, 'character': 6, 'mediocre': 2, 'pg': 2, '13': 2, 'make': 12, 'complete': 1, 'sequel': 2, 'while': 2, 'changing': 1, 'tone': 1, 'rated': 2, 'family': 1, 'wasn': 4, 't': 31, 'least': 3, 'interested': 1, 'only': 17, 'did': 3, 'confirm': 1, 'would': 14, 'be': 21, 'unfunny': 2, 'generic': 1, 'also': 9, 'managed': 1, 'give': 2, 'away': 3, 'entire': 2, 'm': 4, 'exaggerating': 1, 'moment': 1, 'point': 2, 'joke': 2, 'told': 2, 'trailer': 1, 'funny': 3, 'talented': 2, 'carrell': 1, 'save': 1, 'his': 10, 'co': 1, 'stars': 1, 'don': 7, 'fare': 1, 'much': 13, 'better': 4, 'people': 4, 'like': 14, 'morgan': 1, 'freeman': 1, 'jonah': 1, 'hill': 1, 'ed': 2, 'helms': 1, 'story': 6, 'itself': 2, 'lazy': 1, 'real': 7, 'effects': 2, 'work': 10, 'presence': 1, 'all': 22, 'animals': 1, 'integration': 1, 'those': 5, 'into': 7, 'scenes': 6, 'worst': 4, 'obvious': 1, 'blue': 1, 'green': 1, 'screen': 3, 've': 5, 'ever': 7, 'seen': 7, 'whatever': 5, 'cost': 2, 'them': 5, 'so': 22, 'didn': 10, 'translate': 1, 'quality': 1, 'succeeds': 1, 'despite': 1, 'perhaps': 2, 'obviously': 2, 'meagre': 1, 'glad': 2, 'go': 4, 'choice': 1, 'lesser': 1, 'have': 13, 'addition': 2, 'having': 3, 'lovely': 1, 'songs': 3, 'written': 4, 'french': 2, 'cancan': 1, 'boasts': 1, 'cutest': 1, 'leading': 1, 'ladies': 1, 'grace': 1, 'hard': 3, 'fall': 1, 'head': 2, 'over': 6, 'heels': 1, 'girl': 5, 'negative': 3, 'insipid': 1, 'cause': 2, 'regret': 1, 'another': 2, '2': 1, 'life': 5, 'front': 1, 'long': 2, 'whiny': 1, 'pointless': 1, 
'recommend': 4, 'waiting': 1, 'future': 1, 'efforts': 1, 'let': 4, 'excellent': 2, 'cast': 6, 'line': 2, 'performances': 2, 'totally': 5, 'believable': 2, 'anne': 1, 'heche': 1, 'utterly': 3, 'convincing': 2, 'sam': 1, 'shepard': 1, 'portrayal': 1, 'gung': 1, 'ho': 1, 'marine': 1, 'sobering': 1, 'sat': 1, 'riveted': 1, 'tv': 2, 'resounding': 1, '9': 1, '10': 7, 'do': 6, 'tom': 1, 'hanks': 1, 'actor': 2, 'enjoyed': 2, 'reading': 1, 'book': 1, 'my': 11, 'children': 2, 'when': 6, 'annoying': 6, 'voice': 2, 'gives': 1, 'me': 8, 'feeling': 4, 'fingernails': 1, 'chalkboard': 1, 'unnecessary': 1, 'train': 1, 'roller': 1, 'coaster': 1, 'scene': 4, 'absolutely': 4, 'warmth': 1, 'these': 5, 'grates': 1, 'nerves': 1, 'improved': 1, 'by': 17, 'improvisation': 1, 'actors': 7, 'now': 5, 'twice': 1, 'worry': 1, 'whether': 2, 're': 3, 'delivering': 1, 'well': 8, 'any': 7, 'honestly': 1, 'often': 2, 'dialogue': 6, 'doesn': 5, 'really': 12, 'follow': 1, 'surroundings': 1, 'crackles': 1, 'unpredictable': 1, 'youthful': 1, 'energy': 3, 'found': 4, 'concentrate': 1, 'meanders': 1, 'badly': 2, 'generally': 3, 'great': 6, 'things': 9, 'wouldn': 3, 'worth': 1, 'though': 4, 'suspense': 3, 'builders': 1, 'cross': 1, 'g': 1, 'especially': 4, 'liked': 1, 'cliche': 1, 'choices': 1, 'parents': 2, 'movies': 5, 'could': 11, 'predict': 1, 'dialog': 2, 'verbatim': 1, 'writing': 8, 'made': 9, 'selections': 1, 'want': 2, 'gross': 1, 'chills': 1, 'alexander': 1, 'nevsky': 1, 'he': 10, 'amazing': 2, 'artist': 1, 'whoever': 1, 'lived': 1, 'pretentious': 3, 'piece': 4, 'planned': 1, 'dodge': 1, 'stratus': 1, 'big': 3, 'shots': 3, 'gonna': 1, 'help': 1, 'makers': 1, 'aren': 2, 'restrained': 1, 'business': 1, 'qu': 1, 'bec': 1, 'problem': 2, 'script': 11, 'horrendous': 1, 'nothing': 7, 'frustration': 1, 'retarded': 1, 'girls': 1, 'frankly': 2, 'after': 4, 'cotton': 1, 'club': 1, 'unfaithful': 1, 'embarrassing': 2, 'watch': 7, 'lane': 2, 'gere': 1, 'bad': 31, 'dialogs': 1, 'extremely': 2, 'shallow': 1, 
'insincere': 1, 'too': 12, 'chick': 2, 'politically': 1, 'correct': 1, 'disappointing': 2, 'never': 6, 'her': 6, 'lousy': 2, 'hour': 3, 'wish': 2, 'bring': 2, 'back': 3, 'directing': 5, 'cinematography': 2, 'boring': 4, 'sometimes': 3, 'myself': 2, 'occupied': 2, 'peaking': 1, 'paper': 1, 'instead': 1, 'watching': 8, 'happened': 2, 'during': 1, 'columbo': 1, 'before': 3, 'look': 5, 'seems': 1, 'oh': 4, 'mature': 1, 'neighbour': 1, 'misplace': 1, 'weaker': 1, 'episode': 2, 'then': 3, 'debated': 1, 'sack': 1, 'trumpeter': 1, 'falsely': 1, 'accused': 1, 'murder': 1, 'horror': 4, 'stupid': 12, 'thought': 5, 'playing': 2, 'villain': 1, 'rent': 2, 'michael': 1, 'ironside': 1, 'however': 3, 'up': 9, 'fact': 2, 'overall': 2, 'tremendously': 1, 'chemistry': 2, 'between': 5, 'ben': 1, 'affleck': 1, 'sandra': 1, 'bullock': 1, 'couldn': 4, 'understand': 4, 'why': 3, 'consider': 1, 'leaving': 1, 'wife': 1, 'supposedly': 1, 'knocked': 1, 'several': 2, 'moments': 1, 'need': 1, 'excruciatingly': 1, 'remake': 3, 'friends': 1, 'wedding': 1, 'disappointment': 2, 'cannot': 2, 'believe': 4, 'agreed': 1, 'stand': 1, 'fear': 1, 'losing': 2, 'q': 1, 'nobody': 3, 'network': 1, 'aired': 1, 'dribble': 1, 'watched': 4, 'putting': 2, 'imdb': 1, 'ratings': 1, 'awful': 10, 'get': 7, 'numbers': 1, 'cases': 1, 'such': 5, 'saw': 3, 'mirrormask': 1, 'last': 1, 'night': 2, 'unsatisfactory': 1, 'experience': 3, 'unfortunately': 3, 'inexperience': 1, 'direction': 2, 'meant': 1, 'passed': 1, 'way': 8, 'dramatic': 1, 'tension': 1, 'conflict': 1, 'central': 1, 'themes': 1, 'handled': 2, 'ineptly': 1, 'stereotypically': 1, 'depth': 4, 'imagination': 4, 'pretty': 6, 'pictures': 1, 'world': 2, 'flawed': 2, 'core': 2, 'following': 1, 'bunch': 2, 'high': 1, 'schoolers': 1, 'whine': 1, 'cry': 1, 'relate': 1, 'hell': 1, 'barely': 3, 'disaster': 2, 'editing': 1, 'confuses': 1, 'incredibly': 2, 'fish': 1, 'underwater': 1, 'repeated': 1, 'thousand': 1, 'times': 3, 'truly': 2, 'terrible': 8, 'worse': 8, 'possible': 
2, 'redeemed': 1, 'mst3k': 1, 'fodder': 1, 'paid': 2, 'whatsoever': 3, 'again': 4, 'horrible': 3, 'gosh': 1, 'walk': 1, 'theatre': 1, 'few': 3, 'minutes': 6, 'relief': 1, 'hate': 2, 'yeah': 2, 'sucked': 3, 'storyline': 4, 'pillow': 1, 'girlfriend': 2, 'boyfriend': 1, 'keep': 2, 'through': 5, 'gone': 1, 'disliked': 2, 'thing': 6, 'tickets': 1, 'five': 1, 'dollars': 1, 'mad': 1, 'd': 4, '7': 1, '50': 1, 'identifies': 1, 'cardboard': 2, 'cutouts': 1, 'stereotypes': 2, 'predictably': 1, 'reverse': 1, 'ugly': 1, 'cartoon': 1, 'crafted': 2, 'paul': 2, 'haggis': 1, 'handle': 1, 'bold': 1, 'strokes': 1, 'storytelling': 1, 'painted': 1, 'crayons': 1, 'crash': 1, 'depressing': 2, 'provokes': 1, 'emotion': 1, 'teaches': 1, 'already': 1, 'know': 6, 'racism': 1, 'prejudice': 1, 'brain': 1, 'will': 4, 'attempt': 2, 'shut': 1, 'down': 1, 'primal': 1, 'impulse': 1, 'self': 2, 'preservation': 1, 'left': 1, 'shattered': 1, 'took': 2, 'fully': 1, 'recover': 1, 'joins': 1, 'revenge': 1, 'boogeyman': 1, 'zombiez': 1, 'hellish': 1, 'trinity': 1, 'mean': 4, 'distinction': 1, 'has': 8, 'redeeming': 2, 'features': 1, 'everything': 4, 'appalling': 2, 'artless': 1, 'endlessly': 1, 'presents': 1, 'us': 1, 'ugliest': 1, 'setting': 2, 'e': 1, 'beyond': 3, 'kids': 3, 'lead': 3, 'charisma': 2, 'free': 3, 'without': 2, 'merit': 1, 'akin': 1, 'torture': 4, 'll': 2, 'maybe': 2, 'reasonable': 1, 'explanation': 2, 'atrocity': 1, 'pleasant': 1, 'voyage': 1, 'discovery': 1, 'highly': 2, 'unrecommended': 1, 'premise': 3, 'sound': 2, 'suffered': 1, 'needed': 1, 'many': 4, 'close': 1, 'ups': 1, 'seemed': 1, 'drag': 1, 'heroes': 1, 'freedom': 2, 'definitely': 2, 'below': 2, 'remember': 2, 'ray': 1, 'charles': 1, 'being': 2, 'acted': 1, 'played': 1, 'himself': 1, 'legendary': 1, 'provided': 1, 'biographical': 1, 'material': 2, 'which': 7, 'goes': 1, 'musician': 1, 'hitchcock': 1, 'director': 3, 'ironically': 1, 'mostly': 1, 'find': 2, 'total': 1, 'secondly': 1, 'perfected': 1, 'thriller': 1, 'chase': 1, 
'pandering': 1, 'sabotages': 1, 'hence': 1, 'whole': 3, 'certain': 1, 'rumbles': 1, 'machine': 2, 'desperately': 1, 'depending': 1, 'new': 2, 'usual': 1, 'logic': 1, 'flaws': 1, 'mishima': 1, 'uninteresting': 1, 'chilly': 1, 'unremarkable': 1, 'author': 1, 'living': 2, 'working': 1, 'abstruse': 1, 'culture': 1, 'reenactments': 1, 'hold': 1, 'attention': 1, 'emotionally': 1, 'adrift': 1, 'stagy': 1, 'sits': 1, 'soldiers': 1, 'singing': 2, 'masculinity': 1, 'pledge': 1, 'themselves': 1, 'hairsplitting': 1, 'purity': 1, 'admiration': 1, 'swords': 1, 'etc': 2, 'bore': 2, 'kill': 1, 'momentum': 1, 'quicker': 1, 'else': 2, 'schrader': 1, 'resume': 1, 'full': 3, 'amateurish': 1, 'first': 5, '80s': 1, 'loved': 1, 'fascinated': 1, 'dancing': 1, 'recently': 2, 'dvd': 2, 'completely': 3, 'struck': 1, 'contained': 1, 'holes': 1, 'inconsistencies': 1, 'lot': 3, 'horrid': 1, 'realistic': 1, 'she': 1, 'gotten': 1, 'ballet': 1, 'repertory': 1, 'pathetic': 4, 'developments': 1, 'lacked': 2, 'woa': 1, 'talk': 3, 'sappiest': 1, 'what': 6, 'unwatchable': 1, 'tell': 1, 'talent': 2, 'gave': 2, 'action': 2, 'check': 1, 'filmography': 1, 'site': 1, 'chance': 1, 'intentions': 1, 'might': 2, 'greatest': 1, 'master': 1, 'theme': 1, 'undertone': 1, 'fifties': 1, 'existential': 1, 'weariness': 1, 'aerial': 1, 'ought': 1, 'thrilled': 2, 'both': 4, 'senses': 2, 'deeply': 1, 'care': 2, 'regrettably': 1, 'fails': 2, 'visual': 2, 'interest': 1, 'drama': 2, 'expression': 1, 'celebration': 1, 'patriotism': 1, 'underlines': 1, 'narrative': 2, 'actress': 1, 'been': 6, 'used': 4, 'june': 1, 'allison': 1, 'yet': 1, 'plain': 1, 'called': 1, 'soundtrack': 1, 'concert': 1, 'sequences': 1, 'nice': 1, 'cheap': 5, 'trash': 2, 'considering': 2, 'ridiculousness': 1, 'came': 2, 'angry': 1, 'spoilers': 2, 'said': 2, 'surface': 1, 'superbly': 1, 'stunning': 1, 'fx': 1, 'state': 1, 'conceptually': 1, 'show': 10, 'offers': 1, 'everybody': 1, 'fantasy': 1, 'fans': 1, 'single': 1, 'sour': 1, 'note': 2, 'wise': 1, 
'either': 1, 'surprisingly': 2, 'solid': 2, 'casting': 2, 'here': 8, 'considered': 1, 'job': 1, 'done': 2, 'thanks': 1, 'released': 1, 'mexican': 1, 'less': 3, 'understood': 1, 'matter': 1, 'identified': 1, 'should': 4, 'felt': 2, 'ranks': 1, 'noir': 2, 'crime': 1, 'incredible': 1, 'belmondo': 1, 'lino': 1, 'ventura': 1, 'given': 1, 'complex': 1, 'psychological': 1, 'portrayals': 1, 'detailing': 1, 'loyalty': 1, 'treachery': 1, 'hope': 1, 'tremendous': 1, 'melville': 1, 'journey': 1, 'eyes': 2, 'soul': 1, 'child': 1, 'water': 1, 'manages': 1, 'transcend': 1, 'limitations': 1, 'indie': 1, 'continually': 1, 'subverting': 1, 'expectations': 1, 'emerge': 1, 'intense': 1, 'gripping': 1, 'crocdodile': 1, 'indeed': 1, 'website': 1, 'believed': 1, 'crocs': 1, 'swamp': 1, 'location': 2, 'fabulous': 1, 'thoroughly': 1, 'christopher': 1, 'eccleston': 1, 'control': 2, 'tardis': 1, 'continuation': 1, 'trying': 2, 'turn': 2, 'day': 1, 'disturbing': 1, 'memories': 1, 'succeeded': 1, 'places': 2, 'forced': 1, 'pi': 1, 'off': 5, 'started': 4, 'jerky': 2, 'camerawork': 1, 'theater': 1, 'going': 6, 'sick': 2, 'summary': 1, 'witticisms': 1, 'weren': 2, 'witty': 1, 'billy': 1, 'bob': 1, 'rise': 1, 'above': 1, 'rating': 4, 'finale': 1, 'possibly': 1, ...}
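b) Now counting Positive words (reviews with Sentiment = 1). Words found in 5 or fewer reviews can be omitted with pop_words, but that call is commented out below, so the full counts are displayed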
# Lower-cased word -> number of distinct positive (Sentiment == 1) training
# reviews that contain it.
positive_words = {}
labelled_reviews = zip(train_data.Reviews, train_data.Sentiment)
for review_no, (review, label) in enumerate(labelled_reviews, start=1):
    if label != 1:
        # Not a positive review: it still consumes a review number.
        continue
    for token in review.split():
        positive_words.setdefault(token.lower(), set()).add(review_no)

# Collapse each set of review numbers into its size.
positive_words = {
    token: len(review_ids) for token, review_ids in positive_words.items()
}
# Pruning of rare words is left disabled, so every word is kept.
#pop_words(positive_words)
positive_words
{'the': 93, 'best': 10, 'scene': 1, 'in': 37, 'movie': 35, 'was': 36, 'when': 8, 'gerardo': 1, 'is': 49, 'trying': 1, 'to': 48, 'find': 4, 'a': 78, 'song': 2, 'that': 31, 'keeps': 1, 'running': 1, 'through': 3, 'his': 8, 'head': 1, 'saw': 4, 'today': 3, 'and': 88, 'thought': 4, 'it': 57, 'good': 14, 'effort': 1, 'messages': 1, 'for': 19, 'kids': 3, 'loved': 4, 'casting': 3, 'of': 60, 'jimmy': 2, 'buffet': 1, 'as': 18, 'science': 1, 'teacher': 1, 'those': 3, 'baby': 1, 'owls': 1, 'were': 10, 'adorable': 4, 'showed': 2, 'lot': 1, 'florida': 1, 'at': 10, 's': 30, 'made': 5, 'look': 6, 'very': 11, 'appealing': 1, 'songs': 1, 'muppets': 1, 'so': 15, 'hilarious': 4, 'cool': 4, 'this': 52, 'right': 3, 'on': 10, 'case': 1, 'delivers': 2, 'everything': 2, 'almost': 2, 'your': 3, 'face': 2, 'review': 1, 'long': 3, 'overdue': 1, 'since': 2, 'i': 63, 'consider': 2, 'tale': 1, 'two': 1, 'sisters': 1, 'be': 11, 'single': 3, 'greatest': 2, 'film': 33, 'ever': 6, 'll': 4, 'put': 1, 'gem': 1, 'up': 4, 'against': 1, 'any': 3, 'terms': 2, 'screenplay': 1, 'cinematography': 4, 'acting': 5, 'post': 1, 'production': 2, 'editing': 2, 'directing': 1, 'or': 5, 'other': 8, 'aspect': 1, 'making': 4, 'practically': 1, 'perfect': 3, 'all': 8, 'them': 7, 'true': 3, 'masterpiece': 2, 'sea': 1, 'faux': 2, 'masterpieces': 1, 'if': 8, 'first': 2, 've': 2, 'given': 2, '10': 9, 'years': 5, 'there': 8, 'needed': 1, 'word': 1, 'mouth': 1, 'promote': 1, 'overall': 3, 'interesting': 6, 'provoking': 1, 'plus': 2, 'well': 12, 'paced': 1, 'suited': 1, 'its': 7, 'relatively': 1, 'short': 2, 'run': 2, 'time': 7, 'give': 5, 'one': 11, 'gave': 2, 'wind': 3, 'lion': 3, 'written': 1, 'superbly': 1, 'acted': 1, 'classic': 3, 'actually': 5, 'turned': 1, 'out': 15, 'pretty': 2, 'decent': 2, 'far': 1, 'b': 1, 'list': 1, 'horror': 1, 'suspense': 2, 'films': 3, 'go': 4, 'definitely': 4, 'worth': 7, 'checking': 3, 'manna': 1, 'from': 10, 'heaven': 1, 'terrific': 3, 'both': 3, 'predictable': 1, 'unpredictable': 1, 
'same': 2, 'scenes': 2, 'are': 19, 'often': 2, 'funny': 10, 'occasionally': 4, 'touching': 2, 'characters': 9, 'evaluate': 1, 'their': 3, 'lives': 1, 'where': 3, 'they': 5, 'going': 2, 'cast': 8, 'veteran': 1, 'actors': 4, 'more': 12, 'than': 5, 'just': 7, 'nostalgia': 1, 'trip': 1, 'ursula': 1, 'burton': 1, 'portrayal': 2, 'nun': 1, 'with': 19, 'fun': 2, 'nuns': 1, 'church': 2, 'you': 9, 'looking': 1, 'some': 6, 'music': 3, 'including': 3, 'shirley': 1, 'jones': 1, 'rendition': 1, 'way': 6, 'tonight': 1, 'an': 13, 'uplifting': 2, 'ending': 4, 'try': 1, 'don': 6, 't': 13, 'think': 7, 'will': 7, 'disappointed': 1, 'only': 2, 'thing': 2, 'really': 14, 'watching': 4, 'scenery': 1, 'house': 2, 'because': 5, 'beautiful': 6, 'but': 21, 'writing': 3, 'fresh': 1, 'bold': 1, 'helps': 2, 'along': 2, 'maybe': 1, 'idiot': 1, 'savant': 1, 'sister': 1, 'could': 5, 'have': 9, 'been': 2, 'played': 5, 'better': 3, 'real': 5, 'joy': 4, 'watch': 4, 'applause': 1, 'should': 1, 'prelude': 1, 'however': 2, 'liked': 3, 'great': 12, 'by': 7, 'director': 2, 'had': 7, 'edge': 2, 'seat': 1, 'somewhat': 1, 'afraid': 1, 'car': 1, 'end': 2, 'night': 1, 'nice': 4, 'too': 5, 'd': 2, 'advise': 1, 'anyone': 8, 'see': 12, 'brilliant': 5, 'much': 2, 'rocked': 1, 'my': 12, 'world': 3, 'certainly': 1, 'must': 4, 'no': 1, 'social': 1, 'physical': 1, 'outlets': 1, 'cult': 1, 'viewing': 1, 'sharing': 1, 'others': 1, 'also': 5, 'treat': 1, 'anthony': 1, 'quinn': 1, 'playing': 3, 'crazy': 2, 'horse': 1, 'still': 3, 'do': 6, 'like': 6, 'empowerment': 1, 'women': 1, 'not': 5, 'enough': 2, 'movies': 5, 'excellent': 4, 'performance': 4, 'ms': 1, 'garbo': 1, 'who': 9, 'off': 2, 'bat': 1, 'her': 6, 'talents': 1, 'carry': 1, 'over': 4, 'silent': 3, 'era': 1, 'wanted': 2, 'work': 4, 'netflix': 1, 'doesn': 3, 'seem': 1, 'stocking': 1, 'renowned': 1, 'screenwriter': 2, 'frances': 1, 'marion': 1, 'hasn': 1, 'missed': 1, 'step': 2, 'sound': 2, 'love': 7, '1': 2, 'bad': 2, '0': 1, 'ones': 1, 'received': 1, 'quality': 1, 
'age': 2, 'john': 3, 'wayne': 1, 'did': 5, 'incredible': 1, 'job': 4, 'being': 3, 'young': 1, 'industry': 1, 'screen': 3, 'presence': 1, 'shined': 1, 'even': 4, 'though': 5, 'senior': 1, 'him': 4, 'older': 1, 'how': 4, 'enjoy': 1, 'seen': 3, 'half': 1, 'boring': 2, 'self': 2, 'indulgent': 2, 'piece': 2, 'junk': 1, 'probably': 3, 'would': 3, 'hadn': 1, 'spent': 1, 'most': 6, 'showcasing': 1, 'own': 4, 'art': 5, 'which': 4, 'isn': 1, 'noteworthy': 2, 'another': 2, 'didn': 1, 'character': 4, 'got': 3, 'punched': 1, 'gallon': 1, 'blood': 1, 'spew': 1, 'forth': 1, 'soon': 1, 'after': 5, 'jamie': 1, 'foxx': 1, 'absolutely': 3, 'ray': 1, 'charles': 1, 'simply': 1, 'genius': 2, 'he': 7, 'owns': 1, 'spacek': 1, 'owned': 1, 'coal': 1, 'miner': 1, 'daughter': 1, 'quaid': 1, 'balls': 1, 'fire': 1, 'such': 4, 'highly': 3, 'entertaining': 3, 'angles': 1, 'features': 2, 'outlandish': 1, 'array': 1, 'memorable': 1, 'psychotic': 1, 'lovable': 1, 'nuts': 2, 'our': 3, 'enjoyment': 1, 'we': 4, 'get': 4, 'around': 1, 'play': 3, 'games': 1, 'dangerous': 2, 'has': 10, 'sweet': 1, 'moments': 1, 'telephone': 1, 'repair': 1, 'man': 4, 'reactions': 2, 'bitchy': 1, 'boss': 1, 'truly': 2, 'genuine': 1, 'tremendously': 1, 'smart': 2, 'twist': 1, 'television': 1, 'series': 1, 'writers': 1, 'smack': 1, 'actresses': 1, 'bonus': 1, 'show': 3, 'these': 1, 'say': 5, 'taped': 1, 'episodes': 1, 'myself': 3, 'again': 3, 'now': 2, 'know': 4, 'why': 2, 'exactly': 2, 'what': 7, 'about': 7, 'latched': 1, 'endearing': 1, 'become': 2, 'special': 4, 'part': 2, 'family': 4, 'memories': 2, 'totally': 3, 'recommend': 3, 'likes': 2, 'wholesome': 2, 'things': 2, 'four': 1, 'themselves': 2, 'into': 3, 'shows': 1, 'strong': 1, 'sibling': 1, 'bond': 1, 'each': 3, 'action': 1, 'less': 1, 'unneeded': 1, 'controversy': 1, 'solid': 1, 'damian': 1, 'talented': 1, 'versatile': 1, 'many': 4, 'ways': 1, 'portraying': 2, 'different': 4, 'cutting': 1, 'am': 1, 'pleased': 1, 'modern': 1, 'day': 2, 'letting': 1, 'passion': 1, 
'drive': 1, 'taking': 1, 'us': 1, 'audience': 2, 'elias': 1, 'koteas': 1, 'jack': 1, 'palance': 1, 'roles': 1, 'angelina': 1, 'hot': 1, 'gets': 1, 'naked': 1, 'billy': 2, 'drago': 2, 'appears': 1, 'usual': 2, 'cameo': 1, 'sven': 1, 'ole': 1, 'thorsen': 1, 'make': 2, 'enjoyable': 2, 'budget': 2, 'awesome': 1, 'bought': 1, 'ebay': 1, 'story': 3, 'line': 1, 'poler': 1, 'bear': 1, 'kinda': 2, 'cute': 1, 'question': 1, 'fort': 1, 'steele': 1, 'ask': 1, 'away': 1, 'wonderful': 4, 'parts': 4, 'literally': 1, 'full': 1, 'wonder': 2, 'excerpts': 1, 'works': 2, 'sets': 2, 'especially': 1, 'designed': 1, 'camera': 4, 'amazing': 1, 'stylized': 1, 'effective': 1, 'used': 1, 'exemplars': 1, 'set': 1, 'designer': 1, 'stories': 1, 'powerful': 1, 'explorations': 1, 'nature': 1, 'learn': 1, 'artist': 1, 'interested': 1, 'poetry': 1, 'theater': 2, 'politics': 1, 'japanese': 1, 'history': 3, 'here': 4, 'rendering': 1, 'america': 1, 'imperial': 1, 'makes': 3, 'values': 1, 'faultless': 1, 'photography': 1, 'composition': 2, 'underappreciated': 1, 'brian': 1, 'keith': 1, 'bully': 1, 'teddy': 1, 'vivid': 1, 'told': 1, 'largely': 1, 'eyes': 2, 'son': 1, 'every': 6, 'member': 1, 'can': 4, 'identify': 1, 'whether': 1, 'sean': 1, 'connery': 1, 'noble': 1, 'brigand': 1, 'candace': 1, 'bergen': 1, 'feisty': 1, 'heroine': 1, 'huston': 1, 'wily': 1, 'hay': 1, 'steve': 2, 'kanaly': 1, 'spiffy': 1, 'radiant': 1, 'ruthless': 1, 'lieutenant': 1, 'roosevelt': 1, 'big': 3, 'stick': 1, 'high': 1, 'adventure': 1, 'robert': 2, 'ryans': 1, 'portrayed': 1, 'someone': 1, 'father': 1, 'schizophrenic': 1, 'life': 2, 'although': 2, 'never': 2, 'murdered': 2, 'affected': 1, 'during': 2, 'second': 2, 'war': 1, 'worse': 1, 'having': 2, 'humour': 2, 'apt': 1, 'mother': 1, 'brother': 1, 'ryan': 1, 'type': 1, 'imitation': 1, 'individual': 1, 'impressed': 2, 'non': 1, 'linear': 1, 'narration': 1, 'thus': 1, 'flashbacks': 1, 'articulated': 1, 'quite': 2, 'monica': 1, 'bellucci': 1, 'commentary': 1, 'undoubtedly': 1, 
'seeing': 3, 'people': 2, 'timers': 1, 'popular': 2, 'cinema': 3, 'plenty': 2, 'laughs': 1, 'feel': 1, 'felt': 1, 'came': 2, 'northern': 1, 'positive': 1, 'community': 1, 'represents': 1, 'rather': 1, 'enjoyed': 4, 'created': 2, 'unique': 1, 'feeling': 2, 'vivian': 1, 'schilling': 1, 'script': 3, '95': 1, 'garbage': 2, 'theatres': 1, 'role': 3, 'screamy': 1, 'masculine': 1, 'casted': 1, 'ready': 1, 'european': 1, 'throwback': 1, 'student': 1, '1980': 1, 'experiences': 1, 'living': 1, 'abroad': 1, 'interacting': 1, 'nationalities': 1, 'circumstances': 1, 'slightly': 1, 'angel': 1, 'scamp': 1, 'little': 2, 'yelps': 1, 'hes': 1, 'scared': 1, 'funniest': 1, 'caught': 1, 'under': 2, 'curtain': 1, 'singing': 1, 'ive': 1, 'before': 3, 'coming': 1, 'edition': 1, 'june': 1, '20': 1, 'cover': 1, 'underneath': 1, 'lid': 1, 'cant': 1, 'explain': 1, 'romantic': 1, 'charming': 1, 'junkyard': 1, 'dogs': 1, 'something': 2, 'laughed': 1, 'buy': 1, 'comes': 1, 'new': 1, 'premise': 1, 'always': 3, 'nut': 1, 'bag': 1, 'side': 1, 'note': 1, 'stephen': 1, 'mchattie': 1, 'lance': 1, 'hendrikson': 1, 'flick': 1, 'together': 1, 'talk': 1, 'raging': 1, 'cheekbones': 1, 'soundtrack': 2, 'wasn': 2, 'terrible': 1, 'either': 1, 'oy': 1, 'vey': 1, 'scale': 2, 'jobs': 1, 'amusing': 1, 'last': 1, '15': 1, 'minutes': 1, 'armand': 1, 'assante': 1, 'cable': 1, 'company': 1, 'summary': 1, 'sounded': 1, 'watched': 2, 'twice': 1, 'already': 1, 'believe': 3, 'tying': 1, 'loose': 1, 'ends': 1, '8': 1, 'score': 1, 'mostly': 2, 'plot': 2, 'won': 1, 'spoilers': 1, 'want': 2, 'deeply': 1, 'knew': 2, 'come': 1, 'gifted': 1, 'actor': 3, 'share': 1, 'ups': 2, 'down': 1, 'starring': 1, 'jaclyn': 1, 'smith': 1, 'god': 1, '12': 1, 'ago': 1, 'contained': 1, 'star': 1, 'opened': 1, 'haven': 1, 'kind': 1, 'length': 1, 'minute': 1, 'rate': 1, 'lifetime': 1, 'does': 2, 'air': 1, 'knows': 1, 'store': 1, 'sells': 1, 'let': 1, 'me': 4, 'loads': 1, 'understatement': 1, 'black': 3, 'comedy': 1, 'few': 1, 'remember': 1, 
'creates': 1, 'universe': 1, 'fascinating': 1, 'hope': 2, 'team': 1, 'behind': 1, 'continue': 1, 'weird': 1, 'style': 1, 'forgot': 1, 'superb': 1, 'trond': 1, 'fausa': 1, 'aurv': 1, 'g': 1, 'bothersome': 1, 'understand': 1, 'doing': 1, 'humorous': 2, 'comment': 1, '2006': 1, 'found': 2, 'move': 1, 'initially': 1, 'local': 1, 'sites': 1, 'filmed': 2, 'buffalo': 1, 'intrigued': 1, 'later': 1, 'lost': 2, 'power': 1, 'entire': 1, 'applauded': 1, 'conclusion': 1, 'left': 1, 'lilt': 1, 'heart': 1, 'human': 1, 'race': 1, 'duris': 1, 'appearance': 2, 'gives': 1, 'fine': 2, 'rest': 1, 'views': 1, 'barcelona': 1, 'famed': 1, 'gaudi': 1, 'towers': 1, 'martin': 1, 'middle': 1, 'aged': 1, 'upper': 1, 'class': 1, 'uptight': 1, 'white': 3, 'guy': 1, 'add': 1, 'betty': 1, 'jean': 1, 'everyone': 1, 'steamboat': 1, 'willie': 1, 'amazingly': 1, 'important': 1, 'mickey': 4, 'mouse': 1, 'following': 1, 'plane': 1, 'earlier': 1, 'year': 1, 'famous': 1, 'ground': 1, 'breaking': 1, 'while': 1, 'yet': 2, 'hear': 1, 'speak': 1, 'tons': 1, 'effects': 2, 'throughout': 1, 'take': 1, 'granted': 1, 'huge': 1, 'crowd': 2, 'pleaser': 2, '1928': 1, '25': 1, 'amazed': 1, 'timeless': 1, 'turkey': 1, 'straw': 1, 'imaginative': 1, 'cruel': 1, 'clever': 1, 'ranks': 1, 'among': 1, '80': 1, 'original': 2, 'body': 1, 'soul': 1, '1947': 1, 'garfield': 1, 'ann': 1, 'revere': 1, 'lilli': 1, 'plmer': 1, 'william': 1, 'conrad': 1, 'canada': 1, 'lee': 1, 'cinematographers': 1, 'grace': 1, 'james': 1, 'wong': 1, 'howe': 1, 'okay': 1, 'fair': 1, 'critic': 1, 'credit': 1, 'due': 2, 'creature': 1, 'gotta': 1, 'close': 2, 'slimy': 1, 'drooling': 1, 'teeth': 1, 'sole': 1, 'bright': 1, 'spot': 1, 'jonah': 1, 'hill': 1, 'unrecognizable': 1, 'fans': 1, 'recent': 1, 'superbad': 1, 'amount': 1, 'weight': 1, 'interim': 1, '90': 2, 'child': 3, 'proudly': 1, 'classical': 1, 'wb': 1, 'cartoons': 2, 'tiny': 1, 'toons': 1, 'kept': 1, 'vibe': 1, 'delivered': 1, 'underrated': 1, 'murky': 1, 'episode': 1, 'product': 1, 'related': 
1, 'easily': 1, 'none': 1, 'cartoon': 1, 'laugh': 1, 'tender': 1, 'getting': 1, 'dark': 1, 'sitcoms': 1, 'oriented': 1, 'teenagers': 1, 'peculiarity': 1, 'lead': 2, 'deserved': 1, 'called': 1, 'perabo': 1, 'energy': 1, 'level': 1, 'obviously': 1, 'comfortable': 1, 'front': 1, 'pitch': 1, 'done': 1, 'longer': 1, 'goes': 1, 'surprised': 1, 'care': 1, 'instant': 1, 'catchy': 1, 'credits': 1, 'miss': 1, 'masterful': 1, 'themes': 1, 'simmering': 1, 'boiling': 1, 'warts': 1, 'study': 1, 'poet': 1, 'bohemian': 1, 'wartime': 1, 'span': 1, 'aerial': 1, 'bombardments': 1, 'london': 1, 'outward': 1, 'tranquillity': 1, 'welsh': 1, 'coastal': 1, 'retreat': 1, 'borderlines': 1, 'between': 1, 'friendship': 1, 'lust': 1, 'dedication': 1, 'experience': 1, 'versus': 1, 'practical': 1, 'concerns': 1, 'jealousy': 1, 'rivalry': 1, 'cowardice': 1, 'egotism': 1, ...}
Calculating the probabilities of positive as well as negative reviews before manipulations
# Sentiment labels of train_data, used below to estimate the class priors.
train_data_y = train_data.Sentiment
def calc_prob(senti_data):
    """Return the empirical class priors of a collection of sentiment labels.

    Args:
        senti_data: Sized iterable of labels (for example a pandas Series or
            a list) in which 0 marks a negative review and 1 a positive one.

    Returns:
        A two-element list ``[P(negative), P(positive)]``: the fraction of
        labels equal to 0 followed by the fraction of labels equal to 1.

    Raises:
        ValueError: If ``senti_data`` is empty, because the fractions are
            undefined.
    """
    # Count on the argument itself; the previous version ignored it and always
    # read the module-level ``train_data_y``.
    total = len(senti_data)
    if total == 0:
        raise ValueError("senti_data must contain at least one label")
    negatives = sum(1 for label in senti_data if label == 0)
    positives = sum(1 for label in senti_data if label == 1)
    return [negatives / total, positives / total]
# Class priors of the training labels: index 0 is negative, index 1 is positive.
probab = calc_prob(train_data_y)
negative_prior, positive_prior = probab
print ("Probability before manupulation ---- Positive review ---- ", positive_prior)
print ("Probability before manupulation ---- negative review ---- ", negative_prior)
Probability before manupulation ---- Positive review ---- 0.41964285714285715 Probability before manupulation ---- negative review ---- 0.5803571428571429
#pop_words(wordlist)
#wordlist
Probability of occurrence of 'the' in the word list, the positive list and the negative list
# P("the"): count of "the" relative to the number of training reviews.
print(wordlist["the"] / len(train_data))
# Condition on the real class sizes. Halving len(train_data) assumed a 50/50
# split, which the class priors printed earlier (about 0.42 / 0.58) contradict.
positive_review_count = (train_data.Sentiment == 1).sum()
negative_review_count = (train_data.Sentiment == 0).sum()
# P("the" | Positive)
print(positive_words["the"] / positive_review_count)
# P("the" | Negative)
print(negative_words["the"] / negative_review_count)
0.49107142857142855 0.41517857142857145 0.5669642857142857
Removing stop words like 'these', 'those', 'is', 'for' etc., which have nothing to do with the reviews
import io
from nltk.corpus import stopwords
def stop_words_removal(list1):
    """Delete every English stop word key from ``list1`` in place.

    ``list1`` is a word -> count dictionary (despite its name). Nothing is
    returned; the dictionary passed in is modified directly.
    """
    english_stop_words = set(stopwords.words("english"))
    # Collect the doomed keys first so the dict is not resized mid-iteration.
    doomed = [word for word in list1 if word in english_stop_words]
    for word in doomed:
        list1.pop(word)
# Strip stop words from all three tables first, then run pop_words on each.
word_tables = (wordlist, positive_words, negative_words)
for table in word_tables:
    stop_words_removal(table)
for table in word_tables:
    pop_words(table)
print(wordlist,"\n",positive_words,"\n",negative_words)
{'movie': 85, 'man': 8, 'characters': 19, 'half': 6, 'black': 7, 'camera': 8, 'even': 26, 'acting': 21, 'plot': 18, 'almost': 8, 'little': 13, 'music': 6, 'best': 15, 'find': 6, 'art': 7, 'saw': 7, 'thought': 9, 'good': 24, 'kids': 6, 'predictable': 6, 'made': 14, 'look': 11, 'everything': 6, 'see': 24, 'film': 66, 'ever': 13, 'cinematography': 6, 'directing': 6, '1': 8, 'think': 12, 'every': 10, 'enough': 8, 'films': 9, 'anyone': 12, 'quite': 7, 'simply': 6, 'beautiful': 8, 'certainly': 6, '0': 6, 'part': 7, 'one': 21, 'love': 12, 'waste': 7, 'kind': 6, 'actually': 6, 'time': 18, 'say': 9, 'character': 10, 'make': 14, 'would': 17, 'also': 14, 'give': 7, 'funny': 13, 'much': 15, 'better': 7, 'people': 6, 'like': 20, 'story': 9, 'real': 12, 'work': 14, 'scenes': 8, 'screen': 6, 'seen': 10, 'go': 8, 'life': 7, 'recommend': 7, 'excellent': 6, 'cast': 14, 'totally': 8, '10': 16, 'enjoyed': 6, 'annoying': 6, 'feeling': 6, 'absolutely': 7, 'actors': 11, 'well': 20, 'dialogue': 6, 'really': 26, 'found': 6, 'great': 18, 'things': 11, 'worth': 8, 'though': 9, 'movies': 10, 'could': 16, 'writing': 11, 'piece': 6, 'big': 6, 'first': 7, 'years': 7, 'interesting': 6, 'pretty': 8, 'definitely': 6, 'script': 14, 'nothing': 7, 'going': 8, 'way': 14, 'ending': 8, 'watch': 11, 'bad': 33, 'thing': 8, 'watching': 12, 'never': 8, 'played': 6, 'boring': 6, 'stupid': 12, 'believe': 7, 'watched': 6, 'awful': 10, 'get': 11, 'terrible': 9, 'worse': 9, 'minutes': 7, 'know': 10, 'many': 8, 'probably': 6, 'show': 13} {'best': 10, 'movie': 35, 'good': 14, 'look': 6, 'film': 33, 'ever': 6, '10': 9, 'interesting': 6, 'well': 12, 'time': 7, 'one': 11, 'worth': 7, 'funny': 10, 'characters': 9, 'cast': 8, 'way': 6, 'think': 7, 'really': 14, 'beautiful': 6, 'great': 12, 'anyone': 8, 'see': 12, 'like': 6, 'love': 7, 'every': 6} {'movie': 50, 'characters': 10, 'even': 22, 'acting': 16, 'plot': 16, 'almost': 6, 'little': 11, 'see': 12, 'film': 33, '1': 6, 'enough': 6, 'films': 6, 'one': 10, 'waste': 7, 
'good': 10, 'time': 11, 'character': 6, 'make': 12, 'would': 14, 'also': 9, 'much': 13, 'like': 14, 'story': 6, 'real': 7, 'work': 10, 'scenes': 6, 'ever': 7, 'seen': 7, 'cast': 6, '10': 7, 'annoying': 6, 'actors': 7, 'well': 8, 'dialogue': 6, 'really': 12, 'great': 6, 'things': 9, 'could': 11, 'writing': 8, 'made': 9, 'script': 11, 'nothing': 7, 'watch': 7, 'bad': 31, 'never': 6, 'watching': 8, 'stupid': 12, 'awful': 10, 'get': 7, 'way': 8, 'pretty': 6, 'terrible': 8, 'worse': 8, 'minutes': 6, 'thing': 6, 'know': 6, 'show': 10, 'going': 6}
Probability of every word in the word list
# One line per remaining word: its count and that count over the training size.
review_total = len(train_data)
for words, occurrences in wordlist.items():
    print(words , "-" , occurrences, " Probability of ", words, "is : ", occurrences/review_total)
movie - 85 Probability of movie is : 0.18973214285714285 man - 8 Probability of man is : 0.017857142857142856 characters - 19 Probability of characters is : 0.04241071428571429 half - 6 Probability of half is : 0.013392857142857142 black - 7 Probability of black is : 0.015625 camera - 8 Probability of camera is : 0.017857142857142856 even - 26 Probability of even is : 0.05803571428571429 acting - 21 Probability of acting is : 0.046875 plot - 18 Probability of plot is : 0.04017857142857143 almost - 8 Probability of almost is : 0.017857142857142856 little - 13 Probability of little is : 0.029017857142857144 music - 6 Probability of music is : 0.013392857142857142 best - 15 Probability of best is : 0.033482142857142856 find - 6 Probability of find is : 0.013392857142857142 art - 7 Probability of art is : 0.015625 saw - 7 Probability of saw is : 0.015625 thought - 9 Probability of thought is : 0.020089285714285716 good - 24 Probability of good is : 0.05357142857142857 kids - 6 Probability of kids is : 0.013392857142857142 predictable - 6 Probability of predictable is : 0.013392857142857142 made - 14 Probability of made is : 0.03125 look - 11 Probability of look is : 0.024553571428571428 everything - 6 Probability of everything is : 0.013392857142857142 see - 24 Probability of see is : 0.05357142857142857 film - 66 Probability of film is : 0.14732142857142858 ever - 13 Probability of ever is : 0.029017857142857144 cinematography - 6 Probability of cinematography is : 0.013392857142857142 directing - 6 Probability of directing is : 0.013392857142857142 1 - 8 Probability of 1 is : 0.017857142857142856 think - 12 Probability of think is : 0.026785714285714284 every - 10 Probability of every is : 0.022321428571428572 enough - 8 Probability of enough is : 0.017857142857142856 films - 9 Probability of films is : 0.020089285714285716 anyone - 12 Probability of anyone is : 0.026785714285714284 quite - 7 Probability of quite is : 0.015625 simply - 6 Probability of simply is : 
0.013392857142857142 beautiful - 8 Probability of beautiful is : 0.017857142857142856 certainly - 6 Probability of certainly is : 0.013392857142857142 0 - 6 Probability of 0 is : 0.013392857142857142 part - 7 Probability of part is : 0.015625 one - 21 Probability of one is : 0.046875 love - 12 Probability of love is : 0.026785714285714284 waste - 7 Probability of waste is : 0.015625 kind - 6 Probability of kind is : 0.013392857142857142 actually - 6 Probability of actually is : 0.013392857142857142 time - 18 Probability of time is : 0.04017857142857143 say - 9 Probability of say is : 0.020089285714285716 character - 10 Probability of character is : 0.022321428571428572 make - 14 Probability of make is : 0.03125 would - 17 Probability of would is : 0.03794642857142857 also - 14 Probability of also is : 0.03125 give - 7 Probability of give is : 0.015625 funny - 13 Probability of funny is : 0.029017857142857144 much - 15 Probability of much is : 0.033482142857142856 better - 7 Probability of better is : 0.015625 people - 6 Probability of people is : 0.013392857142857142 like - 20 Probability of like is : 0.044642857142857144 story - 9 Probability of story is : 0.020089285714285716 real - 12 Probability of real is : 0.026785714285714284 work - 14 Probability of work is : 0.03125 scenes - 8 Probability of scenes is : 0.017857142857142856 screen - 6 Probability of screen is : 0.013392857142857142 seen - 10 Probability of seen is : 0.022321428571428572 go - 8 Probability of go is : 0.017857142857142856 life - 7 Probability of life is : 0.015625 recommend - 7 Probability of recommend is : 0.015625 excellent - 6 Probability of excellent is : 0.013392857142857142 cast - 14 Probability of cast is : 0.03125 totally - 8 Probability of totally is : 0.017857142857142856 10 - 16 Probability of 10 is : 0.03571428571428571 enjoyed - 6 Probability of enjoyed is : 0.013392857142857142 annoying - 6 Probability of annoying is : 0.013392857142857142 feeling - 6 Probability of feeling is 
: 0.013392857142857142 absolutely - 7 Probability of absolutely is : 0.015625 actors - 11 Probability of actors is : 0.024553571428571428 well - 20 Probability of well is : 0.044642857142857144 dialogue - 6 Probability of dialogue is : 0.013392857142857142 really - 26 Probability of really is : 0.05803571428571429 found - 6 Probability of found is : 0.013392857142857142 great - 18 Probability of great is : 0.04017857142857143 things - 11 Probability of things is : 0.024553571428571428 worth - 8 Probability of worth is : 0.017857142857142856 though - 9 Probability of though is : 0.020089285714285716 movies - 10 Probability of movies is : 0.022321428571428572 could - 16 Probability of could is : 0.03571428571428571 writing - 11 Probability of writing is : 0.024553571428571428 piece - 6 Probability of piece is : 0.013392857142857142 big - 6 Probability of big is : 0.013392857142857142 first - 7 Probability of first is : 0.015625 years - 7 Probability of years is : 0.015625 interesting - 6 Probability of interesting is : 0.013392857142857142 pretty - 8 Probability of pretty is : 0.017857142857142856 definitely - 6 Probability of definitely is : 0.013392857142857142 script - 14 Probability of script is : 0.03125 nothing - 7 Probability of nothing is : 0.015625 going - 8 Probability of going is : 0.017857142857142856 way - 14 Probability of way is : 0.03125 ending - 8 Probability of ending is : 0.017857142857142856 watch - 11 Probability of watch is : 0.024553571428571428 bad - 33 Probability of bad is : 0.07366071428571429 thing - 8 Probability of thing is : 0.017857142857142856 watching - 12 Probability of watching is : 0.026785714285714284 never - 8 Probability of never is : 0.017857142857142856 played - 6 Probability of played is : 0.013392857142857142 boring - 6 Probability of boring is : 0.013392857142857142 stupid - 12 Probability of stupid is : 0.026785714285714284 believe - 7 Probability of believe is : 0.015625 watched - 6 Probability of watched is : 
0.013392857142857142 awful - 10 Probability of awful is : 0.022321428571428572 get - 11 Probability of get is : 0.024553571428571428 terrible - 9 Probability of terrible is : 0.020089285714285716 worse - 9 Probability of worse is : 0.020089285714285716 minutes - 7 Probability of minutes is : 0.015625 know - 10 Probability of know is : 0.022321428571428572 many - 8 Probability of many is : 0.017857142857142856 probably - 6 Probability of probably is : 0.013392857142857142 show - 13 Probability of show is : 0.029017857142857144
Conditional probability of all the positive and negative words
# Review texts of train_data.
train_x = train_data.Reviews
def cp(train_data,train_x):
    """Convert the per-class word counts into conditional probabilities.

    For every word in the module-level ``positive_words`` table the count is
    divided by the number of rows of ``train_data`` with ``Sentiment == 1``;
    ``negative_words`` is handled the same way with ``Sentiment == 0``. One
    line is printed per word.

    Args:
        train_data: DataFrame with ``Sentiment`` and ``Reviews`` columns; only
            its per-class row counts are used.
        train_x: Unused; kept so existing calls keep working.

    Returns:
        Tuple ``(positive_conditional, negative_conditional)`` of
        word -> probability dictionaries.

    Raises:
        ValueError: If a class that has counted words has no rows in
            ``train_data``.
    """
    # NOTE(review): the counts are not recomputed from ``train_data``. When a
    # different subset is passed (as k_fold does) a ratio can exceed 1 -- the
    # k_fold output shows 35 / 30 for "movie".
    positive_conditional = {}
    negative_conditional = {}

    # Class sizes do not change per word, so compute them once.
    positive_total = len(train_data.loc[train_data.Sentiment == 1].Reviews)
    negative_total = len(train_data.loc[train_data.Sentiment == 0].Reviews)
    if positive_words and positive_total == 0:
        raise ValueError("train_data contains no positive reviews")
    if negative_words and negative_total == 0:
        raise ValueError("train_data contains no negative reviews")

    for word in positive_words:
        positive_conditional[word] = positive_words[word] / positive_total
        print(word ,"-" , positive_words[word]," Conditional probability of ",word ,"having positive sentiment is ", positive_conditional[word])
    for words in negative_words:
        negative_conditional[words] = negative_words[words] / negative_total
        # This message used to say "positive" for the negative table as well.
        print(words ,"-" , negative_words[words]," Conditional probability of ",words," having negative sentiment is ", negative_conditional[words])
    return positive_conditional, negative_conditional
def f1score(r,pred):
    """Return the percentage of positions where ``pred`` agrees with ``r``.

    Despite its name this is plain accuracy, not an F1 score.

    Args:
        r: Sequence of true labels.
        pred: Sequence of predicted labels, at least as long as ``r``; any
            extra trailing predictions are ignored.

    Returns:
        Accuracy as a float between 0 and 100. An empty ``r`` yields 0.0
        instead of a division by zero.

    Raises:
        ValueError: If ``pred`` has fewer entries than ``r``.
    """
    total = len(r)
    if total == 0:
        return 0.0
    if len(pred) < total:
        raise ValueError("pred must contain a prediction for every label in r")
    matches = sum(1 for actual, guessed in zip(r, pred) if actual == guessed)
    accuracy = matches / total * 100
    return accuracy
As we have all the information available, we will now perform prediction to check the accuracy
import io
from nltk.corpus import stopwords
def stop_words_rem(list1, stop_words=None):
    """Return ``list1`` with its English stop words removed.

    Args:
        list1: Review text (a string, despite the parameter name).
        stop_words: Optional pre-built collection of lower-case stop words.
            When omitted, NLTK's English list is loaded.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces,
        with their original casing.
    """
    if stop_words is None:
        stop_words = set(stopwords.words("english"))
    # Compare whole tokens. The previous version walked the string character
    # by character and discarded the result of str.replace, so it removed
    # nothing and returned None.
    return " ".join(
        token for token in list1.split() if token.lower() not in stop_words
    )


def prediction(data,ans,negative_conditional,positive_conditional,unseen_prob=0.0):
    """Classify each review with the word tables and return the accuracy.

    For every review the per-word conditional probabilities of its non-stop
    words are multiplied per class, weighted by the module-level class priors
    ``probab`` (index 0 negative, index 1 positive), and the review is
    labelled 1 only when the positive score is strictly larger.

    Args:
        data: Iterable of review strings.
        ans: True labels aligned with ``data``; must provide ``tolist()``.
        negative_conditional: word -> P(word | negative) dictionary.
        positive_conditional: word -> P(word | positive) dictionary.
        unseen_prob: Probability used for a word missing from a table. The
            default 0.0 keeps the earlier scoring, where one missing word
            zeroes that class's score.

    Returns:
        The percentage of reviews labelled correctly, as computed by
        ``f1score``.
    """
    # NOTE(review): with unseen_prob == 0.0 a review containing a word absent
    # from both tables scores 0 for both classes and is labelled 0.
    stop_words = set(stopwords.words("english"))  # load once, not per review
    Sentiment_prediction = []
    for statement in data:
        pos_flag = 1
        neg_flag = 1
        # Score the stop-word-free text. split() without an argument avoids
        # the empty tokens that split(" ") produced for repeated spaces.
        for word in stop_words_rem(statement, stop_words).lower().split():
            # .get() leaves the caller's tables untouched; the earlier code
            # inserted a 0 entry for every unknown word.
            pos_flag *= positive_conditional.get(word, unseen_prob)
            neg_flag *= negative_conditional.get(word, unseen_prob)
        pos_probab = pos_flag * probab[1]
        neg_probab = neg_flag * probab[0]
        Sentiment_prediction.append(1 if pos_probab > neg_probab else 0)
    return f1score(ans.tolist(), Sentiment_prediction)
# Accuracy on the test split; the same call on other splits is kept disabled below.
test_accuracy = prediction(
    test_data['Reviews'],
    test_data['Sentiment'],
    negative_conditional,
    positive_conditional,
)
print("Accuracy of test data", test_accuracy)
#print(prediction(train_data['Reviews'],train_data['Sentiment'],negative_conditional,positive_conditional))
#print(prediction(dev_data['Reviews'],dev_data['Sentiment'],negative_conditional,positive_conditional))
Accuracy of test data 71.11111111111111
Implementing five-fold cross-validation
def k_fold(d, k=5):
    """Run k-fold cross-validation over ``d`` and return one accuracy per fold.

    The rows of ``d`` are cut into ``k`` contiguous, near-equal folds. Each
    fold in turn is held out, ``cp`` is evaluated on the remaining rows and
    ``prediction`` scores the held-out fold.

    Args:
        d: DataFrame with ``Reviews`` and ``Sentiment`` columns.
        k: Number of folds; defaults to the original five.

    Returns:
        List of ``k`` accuracies, one per held-out fold, in fold order.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than ``len(d)``.
    """
    if not 2 <= k <= len(d):
        raise ValueError("k must be between 2 and the number of rows in d")

    # Row positions per fold; this is the same contiguous partition that
    # np.array_split produced when applied to the frame itself.
    fold_positions = np.array_split(np.arange(len(d)), k)
    k_acc = []
    for held_out, test_positions in enumerate(fold_positions):
        train_positions = np.concatenate(
            [
                positions
                for fold, positions in enumerate(fold_positions)
                if fold != held_out
            ]
        )
        train_fold = d.iloc[train_positions]
        test_fold = d.iloc[test_positions]

        # Unpacked fresh for every fold: the copy-pasted version misspelled
        # the negative table's name in fold 2 and silently reused fold 1's.
        pos_conditional_dev, neg_conditional_dev = cp(
            train_fold, train_fold[['Reviews']]
        )
        # Passed by keyword because prediction() takes the negative table
        # first; the positional calls used before had the two swapped.
        acc = prediction(
            test_fold['Reviews'],
            test_fold['Sentiment'],
            negative_conditional=neg_conditional_dev,
            positive_conditional=pos_conditional_dev,
        )
        k_acc.append(acc)
    return k_acc
# Per-fold accuracies on dev_data, followed by their arithmetic mean.
accur = k_fold(dev_data)
print(accur)
mean_accuracy = sum(accur) / len(accur)
print("Average = " , mean_accuracy)
best - 10 Conditional probability of best having positive sentiment is 0.3333333333333333 movie - 35 Conditional probability of movie having positive sentiment is 1.1666666666666667 good - 14 Conditional probability of good having positive sentiment is 0.4666666666666667 look - 6 Conditional probability of look having positive sentiment is 0.2 film - 33 Conditional probability of film having positive sentiment is 1.1 ever - 6 Conditional probability of ever having positive sentiment is 0.2 10 - 9 Conditional probability of 10 having positive sentiment is 0.3 interesting - 6 Conditional probability of interesting having positive sentiment is 0.2 well - 12 Conditional probability of well having positive sentiment is 0.4 time - 7 Conditional probability of time having positive sentiment is 0.23333333333333334 one - 11 Conditional probability of one having positive sentiment is 0.36666666666666664 worth - 7 Conditional probability of worth having positive sentiment is 0.23333333333333334 funny - 10 Conditional probability of funny having positive sentiment is 0.3333333333333333 characters - 9 Conditional probability of characters having positive sentiment is 0.3 cast - 8 Conditional probability of cast having positive sentiment is 0.26666666666666666 way - 6 Conditional probability of way having positive sentiment is 0.2 think - 7 Conditional probability of think having positive sentiment is 0.23333333333333334 really - 14 Conditional probability of really having positive sentiment is 0.4666666666666667 beautiful - 6 Conditional probability of beautiful having positive sentiment is 0.2 great - 12 Conditional probability of great having positive sentiment is 0.4 anyone - 8 Conditional probability of anyone having positive sentiment is 0.26666666666666666 see - 12 Conditional probability of see having positive sentiment is 0.4 like - 6 Conditional probability of like having positive sentiment is 0.2 love - 7 Conditional probability of love having positive sentiment is 
0.23333333333333334 every - 6 Conditional probability of every having positive sentiment is 0.2 movie - 50 Conditional probability of movie having positive sentiment is 1.6666666666666667 characters - 10 Conditional probability of characters having positive sentiment is 0.3333333333333333 even - 22 Conditional probability of even having positive sentiment is 0.7333333333333333 acting - 16 Conditional probability of acting having positive sentiment is 0.5333333333333333 plot - 16 Conditional probability of plot having positive sentiment is 0.5333333333333333 almost - 6 Conditional probability of almost having positive sentiment is 0.2 little - 11 Conditional probability of little having positive sentiment is 0.36666666666666664 see - 12 Conditional probability of see having positive sentiment is 0.4 film - 33 Conditional probability of film having positive sentiment is 1.1 1 - 6 Conditional probability of 1 having positive sentiment is 0.2 enough - 6 Conditional probability of enough having positive sentiment is 0.2 films - 6 Conditional probability of films having positive sentiment is 0.2 one - 10 Conditional probability of one having positive sentiment is 0.3333333333333333 waste - 7 Conditional probability of waste having positive sentiment is 0.23333333333333334 good - 10 Conditional probability of good having positive sentiment is 0.3333333333333333 time - 11 Conditional probability of time having positive sentiment is 0.36666666666666664 character - 6 Conditional probability of character having positive sentiment is 0.2 make - 12 Conditional probability of make having positive sentiment is 0.4 would - 14 Conditional probability of would having positive sentiment is 0.4666666666666667 also - 9 Conditional probability of also having positive sentiment is 0.3 much - 13 Conditional probability of much having positive sentiment is 0.43333333333333335 like - 14 Conditional probability of like having positive sentiment is 0.4666666666666667 story - 6 Conditional 
probability of story having positive sentiment is 0.2 real - 7 Conditional probability of real having positive sentiment is 0.23333333333333334 work - 10 Conditional probability of work having positive sentiment is 0.3333333333333333 scenes - 6 Conditional probability of scenes having positive sentiment is 0.2 ever - 7 Conditional probability of ever having positive sentiment is 0.23333333333333334 seen - 7 Conditional probability of seen having positive sentiment is 0.23333333333333334 cast - 6 Conditional probability of cast having positive sentiment is 0.2 10 - 7 Conditional probability of 10 having positive sentiment is 0.23333333333333334 annoying - 6 Conditional probability of annoying having positive sentiment is 0.2 actors - 7 Conditional probability of actors having positive sentiment is 0.23333333333333334 well - 8 Conditional probability of well having positive sentiment is 0.26666666666666666 dialogue - 6 Conditional probability of dialogue having positive sentiment is 0.2 really - 12 Conditional probability of really having positive sentiment is 0.4 great - 6 Conditional probability of great having positive sentiment is 0.2 things - 9 Conditional probability of things having positive sentiment is 0.3 could - 11 Conditional probability of could having positive sentiment is 0.36666666666666664 writing - 8 Conditional probability of writing having positive sentiment is 0.26666666666666666 made - 9 Conditional probability of made having positive sentiment is 0.3 script - 11 Conditional probability of script having positive sentiment is 0.36666666666666664 nothing - 7 Conditional probability of nothing having positive sentiment is 0.23333333333333334 watch - 7 Conditional probability of watch having positive sentiment is 0.23333333333333334 bad - 31 Conditional probability of bad having positive sentiment is 1.0333333333333334 never - 6 Conditional probability of never having positive sentiment is 0.2 watching - 8 Conditional probability of watching having 
positive sentiment is 0.26666666666666666 stupid - 12 Conditional probability of stupid having positive sentiment is 0.4 awful - 10 Conditional probability of awful having positive sentiment is 0.3333333333333333 get - 7 Conditional probability of get having positive sentiment is 0.23333333333333334 way - 8 Conditional probability of way having positive sentiment is 0.26666666666666666 pretty - 6 Conditional probability of pretty having positive sentiment is 0.2 terrible - 8 Conditional probability of terrible having positive sentiment is 0.26666666666666666 worse - 8 Conditional probability of worse having positive sentiment is 0.26666666666666666 minutes - 6 Conditional probability of minutes having positive sentiment is 0.2 thing - 6 Conditional probability of thing having positive sentiment is 0.2 know - 6 Conditional probability of know having positive sentiment is 0.2 show - 10 Conditional probability of show having positive sentiment is 0.3333333333333333 going - 6 Conditional probability of going having positive sentiment is 0.2 best - 10 Conditional probability of best having positive sentiment is 0.30303030303030304 movie - 35 Conditional probability of movie having positive sentiment is 1.0606060606060606 good - 14 Conditional probability of good having positive sentiment is 0.42424242424242425 look - 6 Conditional probability of look having positive sentiment is 0.18181818181818182 film - 33 Conditional probability of film having positive sentiment is 1.0 ever - 6 Conditional probability of ever having positive sentiment is 0.18181818181818182 10 - 9 Conditional probability of 10 having positive sentiment is 0.2727272727272727 interesting - 6 Conditional probability of interesting having positive sentiment is 0.18181818181818182 well - 12 Conditional probability of well having positive sentiment is 0.36363636363636365 time - 7 Conditional probability of time having positive sentiment is 0.21212121212121213 one - 11 Conditional probability of one having 
positive sentiment is 0.3333333333333333 worth - 7 Conditional probability of worth having positive sentiment is 0.21212121212121213 funny - 10 Conditional probability of funny having positive sentiment is 0.30303030303030304 characters - 9 Conditional probability of characters having positive sentiment is 0.2727272727272727 cast - 8 Conditional probability of cast having positive sentiment is 0.24242424242424243 way - 6 Conditional probability of way having positive sentiment is 0.18181818181818182 think - 7 Conditional probability of think having positive sentiment is 0.21212121212121213 really - 14 Conditional probability of really having positive sentiment is 0.42424242424242425 beautiful - 6 Conditional probability of beautiful having positive sentiment is 0.18181818181818182 great - 12 Conditional probability of great having positive sentiment is 0.36363636363636365 anyone - 8 Conditional probability of anyone having positive sentiment is 0.24242424242424243 see - 12 Conditional probability of see having positive sentiment is 0.36363636363636365 like - 6 Conditional probability of like having positive sentiment is 0.18181818181818182 love - 7 Conditional probability of love having positive sentiment is 0.21212121212121213 every - 6 Conditional probability of every having positive sentiment is 0.18181818181818182 movie - 50 Conditional probability of movie having positive sentiment is 1.8518518518518519 characters - 10 Conditional probability of characters having positive sentiment is 0.37037037037037035 even - 22 Conditional probability of even having positive sentiment is 0.8148148148148148 acting - 16 Conditional probability of acting having positive sentiment is 0.5925925925925926 plot - 16 Conditional probability of plot having positive sentiment is 0.5925925925925926 almost - 6 Conditional probability of almost having positive sentiment is 0.2222222222222222 little - 11 Conditional probability of little having positive sentiment is 0.4074074074074074 see 
- 12 Conditional probability of see having positive sentiment is 0.4444444444444444 film - 33 Conditional probability of film having positive sentiment is 1.2222222222222223 1 - 6 Conditional probability of 1 having positive sentiment is 0.2222222222222222 enough - 6 Conditional probability of enough having positive sentiment is 0.2222222222222222 films - 6 Conditional probability of films having positive sentiment is 0.2222222222222222 one - 10 Conditional probability of one having positive sentiment is 0.37037037037037035 waste - 7 Conditional probability of waste having positive sentiment is 0.25925925925925924 good - 10 Conditional probability of good having positive sentiment is 0.37037037037037035 time - 11 Conditional probability of time having positive sentiment is 0.4074074074074074 character - 6 Conditional probability of character having positive sentiment is 0.2222222222222222 make - 12 Conditional probability of make having positive sentiment is 0.4444444444444444 would - 14 Conditional probability of would having positive sentiment is 0.5185185185185185 also - 9 Conditional probability of also having positive sentiment is 0.3333333333333333 much - 13 Conditional probability of much having positive sentiment is 0.48148148148148145 like - 14 Conditional probability of like having positive sentiment is 0.5185185185185185 story - 6 Conditional probability of story having positive sentiment is 0.2222222222222222 real - 7 Conditional probability of real having positive sentiment is 0.25925925925925924 work - 10 Conditional probability of work having positive sentiment is 0.37037037037037035 scenes - 6 Conditional probability of scenes having positive sentiment is 0.2222222222222222 ever - 7 Conditional probability of ever having positive sentiment is 0.25925925925925924 seen - 7 Conditional probability of seen having positive sentiment is 0.25925925925925924 cast - 6 Conditional probability of cast having positive sentiment is 0.2222222222222222 10 - 7 
Conditional probability of 10 having positive sentiment is 0.25925925925925924 annoying - 6 Conditional probability of annoying having positive sentiment is 0.2222222222222222 actors - 7 Conditional probability of actors having positive sentiment is 0.25925925925925924 well - 8 Conditional probability of well having positive sentiment is 0.2962962962962963 dialogue - 6 Conditional probability of dialogue having positive sentiment is 0.2222222222222222 really - 12 Conditional probability of really having positive sentiment is 0.4444444444444444 great - 6 Conditional probability of great having positive sentiment is 0.2222222222222222 things - 9 Conditional probability of things having positive sentiment is 0.3333333333333333 could - 11 Conditional probability of could having positive sentiment is 0.4074074074074074 writing - 8 Conditional probability of writing having positive sentiment is 0.2962962962962963 made - 9 Conditional probability of made having positive sentiment is 0.3333333333333333 script - 11 Conditional probability of script having positive sentiment is 0.4074074074074074 nothing - 7 Conditional probability of nothing having positive sentiment is 0.25925925925925924 watch - 7 Conditional probability of watch having positive sentiment is 0.25925925925925924 bad - 31 Conditional probability of bad having positive sentiment is 1.1481481481481481 never - 6 Conditional probability of never having positive sentiment is 0.2222222222222222 watching - 8 Conditional probability of watching having positive sentiment is 0.2962962962962963 stupid - 12 Conditional probability of stupid having positive sentiment is 0.4444444444444444 awful - 10 Conditional probability of awful having positive sentiment is 0.37037037037037035 get - 7 Conditional probability of get having positive sentiment is 0.25925925925925924 way - 8 Conditional probability of way having positive sentiment is 0.2962962962962963 pretty - 6 Conditional probability of pretty having positive 
sentiment is 0.2222222222222222 terrible - 8 Conditional probability of terrible having positive sentiment is 0.2962962962962963 worse - 8 Conditional probability of worse having positive sentiment is 0.2962962962962963 minutes - 6 Conditional probability of minutes having positive sentiment is 0.2222222222222222 thing - 6 Conditional probability of thing having positive sentiment is 0.2222222222222222 know - 6 Conditional probability of know having positive sentiment is 0.2222222222222222 show - 10 Conditional probability of show having positive sentiment is 0.37037037037037035 going - 6 Conditional probability of going having positive sentiment is 0.2222222222222222 best - 10 Conditional probability of best having positive sentiment is 0.3225806451612903 movie - 35 Conditional probability of movie having positive sentiment is 1.1290322580645162 good - 14 Conditional probability of good having positive sentiment is 0.45161290322580644 look - 6 Conditional probability of look having positive sentiment is 0.1935483870967742 film - 33 Conditional probability of film having positive sentiment is 1.064516129032258 ever - 6 Conditional probability of ever having positive sentiment is 0.1935483870967742 10 - 9 Conditional probability of 10 having positive sentiment is 0.2903225806451613 interesting - 6 Conditional probability of interesting having positive sentiment is 0.1935483870967742 well - 12 Conditional probability of well having positive sentiment is 0.3870967741935484 time - 7 Conditional probability of time having positive sentiment is 0.22580645161290322 one - 11 Conditional probability of one having positive sentiment is 0.3548387096774194 worth - 7 Conditional probability of worth having positive sentiment is 0.22580645161290322 funny - 10 Conditional probability of funny having positive sentiment is 0.3225806451612903 characters - 9 Conditional probability of characters having positive sentiment is 0.2903225806451613 cast - 8 Conditional probability of cast 
having positive sentiment is 0.25806451612903225 way - 6 Conditional probability of way having positive sentiment is 0.1935483870967742 think - 7 Conditional probability of think having positive sentiment is 0.22580645161290322 really - 14 Conditional probability of really having positive sentiment is 0.45161290322580644 beautiful - 6 Conditional probability of beautiful having positive sentiment is 0.1935483870967742 great - 12 Conditional probability of great having positive sentiment is 0.3870967741935484 anyone - 8 Conditional probability of anyone having positive sentiment is 0.25806451612903225 see - 12 Conditional probability of see having positive sentiment is 0.3870967741935484 like - 6 Conditional probability of like having positive sentiment is 0.1935483870967742 love - 7 Conditional probability of love having positive sentiment is 0.22580645161290322 every - 6 Conditional probability of every having positive sentiment is 0.1935483870967742 movie - 50 Conditional probability of movie having positive sentiment is 1.7241379310344827 characters - 10 Conditional probability of characters having positive sentiment is 0.3448275862068966 even - 22 Conditional probability of even having positive sentiment is 0.7586206896551724 acting - 16 Conditional probability of acting having positive sentiment is 0.5517241379310345 plot - 16 Conditional probability of plot having positive sentiment is 0.5517241379310345 almost - 6 Conditional probability of almost having positive sentiment is 0.20689655172413793 little - 11 Conditional probability of little having positive sentiment is 0.3793103448275862 see - 12 Conditional probability of see having positive sentiment is 0.41379310344827586 film - 33 Conditional probability of film having positive sentiment is 1.1379310344827587 1 - 6 Conditional probability of 1 having positive sentiment is 0.20689655172413793 enough - 6 Conditional probability of enough having positive sentiment is 0.20689655172413793 films - 6 
Conditional probability of films having positive sentiment is 0.20689655172413793 one - 10 Conditional probability of one having positive sentiment is 0.3448275862068966 waste - 7 Conditional probability of waste having positive sentiment is 0.2413793103448276 good - 10 Conditional probability of good having positive sentiment is 0.3448275862068966 time - 11 Conditional probability of time having positive sentiment is 0.3793103448275862 character - 6 Conditional probability of character having positive sentiment is 0.20689655172413793 make - 12 Conditional probability of make having positive sentiment is 0.41379310344827586 would - 14 Conditional probability of would having positive sentiment is 0.4827586206896552 also - 9 Conditional probability of also having positive sentiment is 0.3103448275862069 much - 13 Conditional probability of much having positive sentiment is 0.4482758620689655 like - 14 Conditional probability of like having positive sentiment is 0.4827586206896552 story - 6 Conditional probability of story having positive sentiment is 0.20689655172413793 real - 7 Conditional probability of real having positive sentiment is 0.2413793103448276 work - 10 Conditional probability of work having positive sentiment is 0.3448275862068966 scenes - 6 Conditional probability of scenes having positive sentiment is 0.20689655172413793 ever - 7 Conditional probability of ever having positive sentiment is 0.2413793103448276 seen - 7 Conditional probability of seen having positive sentiment is 0.2413793103448276 cast - 6 Conditional probability of cast having positive sentiment is 0.20689655172413793 10 - 7 Conditional probability of 10 having positive sentiment is 0.2413793103448276 annoying - 6 Conditional probability of annoying having positive sentiment is 0.20689655172413793 actors - 7 Conditional probability of actors having positive sentiment is 0.2413793103448276 well - 8 Conditional probability of well having positive sentiment is 0.27586206896551724 
dialogue - 6 Conditional probability of dialogue having positive sentiment is 0.20689655172413793 really - 12 Conditional probability of really having positive sentiment is 0.41379310344827586 great - 6 Conditional probability of great having positive sentiment is 0.20689655172413793 things - 9 Conditional probability of things having positive sentiment is 0.3103448275862069 could - 11 Conditional probability of could having positive sentiment is 0.3793103448275862 writing - 8 Conditional probability of writing having positive sentiment is 0.27586206896551724 made - 9 Conditional probability of made having positive sentiment is 0.3103448275862069 script - 11 Conditional probability of script having positive sentiment is 0.3793103448275862 nothing - 7 Conditional probability of nothing having positive sentiment is 0.2413793103448276 watch - 7 Conditional probability of watch having positive sentiment is 0.2413793103448276 bad - 31 Conditional probability of bad having positive sentiment is 1.0689655172413792 never - 6 Conditional probability of never having positive sentiment is 0.20689655172413793 watching - 8 Conditional probability of watching having positive sentiment is 0.27586206896551724 stupid - 12 Conditional probability of stupid having positive sentiment is 0.41379310344827586 awful - 10 Conditional probability of awful having positive sentiment is 0.3448275862068966 get - 7 Conditional probability of get having positive sentiment is 0.2413793103448276 way - 8 Conditional probability of way having positive sentiment is 0.27586206896551724 pretty - 6 Conditional probability of pretty having positive sentiment is 0.20689655172413793 terrible - 8 Conditional probability of terrible having positive sentiment is 0.27586206896551724 worse - 8 Conditional probability of worse having positive sentiment is 0.27586206896551724 minutes - 6 Conditional probability of minutes having positive sentiment is 0.20689655172413793 thing - 6 Conditional probability of thing 
having positive sentiment is 0.20689655172413793 know - 6 Conditional probability of know having positive sentiment is 0.20689655172413793 show - 10 Conditional probability of show having positive sentiment is 0.3448275862068966 going - 6 Conditional probability of going having positive sentiment is 0.20689655172413793 best - 10 Conditional probability of best having positive sentiment is 0.2631578947368421 movie - 35 Conditional probability of movie having positive sentiment is 0.9210526315789473 good - 14 Conditional probability of good having positive sentiment is 0.3684210526315789 look - 6 Conditional probability of look having positive sentiment is 0.15789473684210525 film - 33 Conditional probability of film having positive sentiment is 0.868421052631579 ever - 6 Conditional probability of ever having positive sentiment is 0.15789473684210525 10 - 9 Conditional probability of 10 having positive sentiment is 0.23684210526315788 interesting - 6 Conditional probability of interesting having positive sentiment is 0.15789473684210525 well - 12 Conditional probability of well having positive sentiment is 0.3157894736842105 time - 7 Conditional probability of time having positive sentiment is 0.18421052631578946 one - 11 Conditional probability of one having positive sentiment is 0.2894736842105263 worth - 7 Conditional probability of worth having positive sentiment is 0.18421052631578946 funny - 10 Conditional probability of funny having positive sentiment is 0.2631578947368421 characters - 9 Conditional probability of characters having positive sentiment is 0.23684210526315788 cast - 8 Conditional probability of cast having positive sentiment is 0.21052631578947367 way - 6 Conditional probability of way having positive sentiment is 0.15789473684210525 think - 7 Conditional probability of think having positive sentiment is 0.18421052631578946 really - 14 Conditional probability of really having positive sentiment is 0.3684210526315789 beautiful - 6 Conditional 
probability of beautiful having positive sentiment is 0.15789473684210525 great - 12 Conditional probability of great having positive sentiment is 0.3157894736842105 anyone - 8 Conditional probability of anyone having positive sentiment is 0.21052631578947367 see - 12 Conditional probability of see having positive sentiment is 0.3157894736842105 like - 6 Conditional probability of like having positive sentiment is 0.15789473684210525 love - 7 Conditional probability of love having positive sentiment is 0.18421052631578946 every - 6 Conditional probability of every having positive sentiment is 0.15789473684210525 movie - 50 Conditional probability of movie having positive sentiment is 2.272727272727273 characters - 10 Conditional probability of characters having positive sentiment is 0.45454545454545453 even - 22 Conditional probability of even having positive sentiment is 1.0 acting - 16 Conditional probability of acting having positive sentiment is 0.7272727272727273 plot - 16 Conditional probability of plot having positive sentiment is 0.7272727272727273 almost - 6 Conditional probability of almost having positive sentiment is 0.2727272727272727 little - 11 Conditional probability of little having positive sentiment is 0.5 see - 12 Conditional probability of see having positive sentiment is 0.5454545454545454 film - 33 Conditional probability of film having positive sentiment is 1.5 1 - 6 Conditional probability of 1 having positive sentiment is 0.2727272727272727 enough - 6 Conditional probability of enough having positive sentiment is 0.2727272727272727 films - 6 Conditional probability of films having positive sentiment is 0.2727272727272727 one - 10 Conditional probability of one having positive sentiment is 0.45454545454545453 waste - 7 Conditional probability of waste having positive sentiment is 0.3181818181818182 good - 10 Conditional probability of good having positive sentiment is 0.45454545454545453 time - 11 Conditional probability of time having 
positive sentiment is 0.5 character - 6 Conditional probability of character having positive sentiment is 0.2727272727272727 make - 12 Conditional probability of make having positive sentiment is 0.5454545454545454 would - 14 Conditional probability of would having positive sentiment is 0.6363636363636364 also - 9 Conditional probability of also having positive sentiment is 0.4090909090909091 much - 13 Conditional probability of much having positive sentiment is 0.5909090909090909 like - 14 Conditional probability of like having positive sentiment is 0.6363636363636364 story - 6 Conditional probability of story having positive sentiment is 0.2727272727272727 real - 7 Conditional probability of real having positive sentiment is 0.3181818181818182 work - 10 Conditional probability of work having positive sentiment is 0.45454545454545453 scenes - 6 Conditional probability of scenes having positive sentiment is 0.2727272727272727 ever - 7 Conditional probability of ever having positive sentiment is 0.3181818181818182 seen - 7 Conditional probability of seen having positive sentiment is 0.3181818181818182 cast - 6 Conditional probability of cast having positive sentiment is 0.2727272727272727 10 - 7 Conditional probability of 10 having positive sentiment is 0.3181818181818182 annoying - 6 Conditional probability of annoying having positive sentiment is 0.2727272727272727 actors - 7 Conditional probability of actors having positive sentiment is 0.3181818181818182 well - 8 Conditional probability of well having positive sentiment is 0.36363636363636365 dialogue - 6 Conditional probability of dialogue having positive sentiment is 0.2727272727272727 really - 12 Conditional probability of really having positive sentiment is 0.5454545454545454 great - 6 Conditional probability of great having positive sentiment is 0.2727272727272727 things - 9 Conditional probability of things having positive sentiment is 0.4090909090909091 could - 11 Conditional probability of could having 
positive sentiment is 0.5 writing - 8 Conditional probability of writing having positive sentiment is 0.36363636363636365 made - 9 Conditional probability of made having positive sentiment is 0.4090909090909091 script - 11 Conditional probability of script having positive sentiment is 0.5 nothing - 7 Conditional probability of nothing having positive sentiment is 0.3181818181818182 watch - 7 Conditional probability of watch having positive sentiment is 0.3181818181818182 bad - 31 Conditional probability of bad having positive sentiment is 1.4090909090909092 never - 6 Conditional probability of never having positive sentiment is 0.2727272727272727 watching - 8 Conditional probability of watching having positive sentiment is 0.36363636363636365 stupid - 12 Conditional probability of stupid having positive sentiment is 0.5454545454545454 awful - 10 Conditional probability of awful having positive sentiment is 0.45454545454545453 get - 7 Conditional probability of get having positive sentiment is 0.3181818181818182 way - 8 Conditional probability of way having positive sentiment is 0.36363636363636365 pretty - 6 Conditional probability of pretty having positive sentiment is 0.2727272727272727 terrible - 8 Conditional probability of terrible having positive sentiment is 0.36363636363636365 worse - 8 Conditional probability of worse having positive sentiment is 0.36363636363636365 minutes - 6 Conditional probability of minutes having positive sentiment is 0.2727272727272727 thing - 6 Conditional probability of thing having positive sentiment is 0.2727272727272727 know - 6 Conditional probability of know having positive sentiment is 0.2727272727272727 show - 10 Conditional probability of show having positive sentiment is 0.45454545454545453 going - 6 Conditional probability of going having positive sentiment is 0.2727272727272727 best - 10 Conditional probability of best having positive sentiment is 0.25 movie - 35 Conditional probability of movie having positive 
sentiment is 0.875 good - 14 Conditional probability of good having positive sentiment is 0.35 look - 6 Conditional probability of look having positive sentiment is 0.15 film - 33 Conditional probability of film having positive sentiment is 0.825 ever - 6 Conditional probability of ever having positive sentiment is 0.15 10 - 9 Conditional probability of 10 having positive sentiment is 0.225 interesting - 6 Conditional probability of interesting having positive sentiment is 0.15 well - 12 Conditional probability of well having positive sentiment is 0.3 time - 7 Conditional probability of time having positive sentiment is 0.175 one - 11 Conditional probability of one having positive sentiment is 0.275 worth - 7 Conditional probability of worth having positive sentiment is 0.175 funny - 10 Conditional probability of funny having positive sentiment is 0.25 characters - 9 Conditional probability of characters having positive sentiment is 0.225 cast - 8 Conditional probability of cast having positive sentiment is 0.2 way - 6 Conditional probability of way having positive sentiment is 0.15 think - 7 Conditional probability of think having positive sentiment is 0.175 really - 14 Conditional probability of really having positive sentiment is 0.35 beautiful - 6 Conditional probability of beautiful having positive sentiment is 0.15 great - 12 Conditional probability of great having positive sentiment is 0.3 anyone - 8 Conditional probability of anyone having positive sentiment is 0.2 see - 12 Conditional probability of see having positive sentiment is 0.3 like - 6 Conditional probability of like having positive sentiment is 0.15 love - 7 Conditional probability of love having positive sentiment is 0.175 every - 6 Conditional probability of every having positive sentiment is 0.15 movie - 50 Conditional probability of movie having positive sentiment is 2.5 characters - 10 Conditional probability of characters having positive sentiment is 0.5 even - 22 Conditional probability of 
even having positive sentiment is 1.1 acting - 16 Conditional probability of acting having positive sentiment is 0.8 plot - 16 Conditional probability of plot having positive sentiment is 0.8 almost - 6 Conditional probability of almost having positive sentiment is 0.3 little - 11 Conditional probability of little having positive sentiment is 0.55 see - 12 Conditional probability of see having positive sentiment is 0.6 film - 33 Conditional probability of film having positive sentiment is 1.65 1 - 6 Conditional probability of 1 having positive sentiment is 0.3 enough - 6 Conditional probability of enough having positive sentiment is 0.3 films - 6 Conditional probability of films having positive sentiment is 0.3 one - 10 Conditional probability of one having positive sentiment is 0.5 waste - 7 Conditional probability of waste having positive sentiment is 0.35 good - 10 Conditional probability of good having positive sentiment is 0.5 time - 11 Conditional probability of time having positive sentiment is 0.55 character - 6 Conditional probability of character having positive sentiment is 0.3 make - 12 Conditional probability of make having positive sentiment is 0.6 would - 14 Conditional probability of would having positive sentiment is 0.7 also - 9 Conditional probability of also having positive sentiment is 0.45 much - 13 Conditional probability of much having positive sentiment is 0.65 like - 14 Conditional probability of like having positive sentiment is 0.7 story - 6 Conditional probability of story having positive sentiment is 0.3 real - 7 Conditional probability of real having positive sentiment is 0.35 work - 10 Conditional probability of work having positive sentiment is 0.5 scenes - 6 Conditional probability of scenes having positive sentiment is 0.3 ever - 7 Conditional probability of ever having positive sentiment is 0.35 seen - 7 Conditional probability of seen having positive sentiment is 0.35 cast - 6 Conditional probability of cast having positive 
sentiment is 0.3 10 - 7 Conditional probability of 10 having positive sentiment is 0.35 annoying - 6 Conditional probability of annoying having positive sentiment is 0.3 actors - 7 Conditional probability of actors having positive sentiment is 0.35 well - 8 Conditional probability of well having positive sentiment is 0.4 dialogue - 6 Conditional probability of dialogue having positive sentiment is 0.3 really - 12 Conditional probability of really having positive sentiment is 0.6 great - 6 Conditional probability of great having positive sentiment is 0.3 things - 9 Conditional probability of things having positive sentiment is 0.45 could - 11 Conditional probability of could having positive sentiment is 0.55 writing - 8 Conditional probability of writing having positive sentiment is 0.4 made - 9 Conditional probability of made having positive sentiment is 0.45 script - 11 Conditional probability of script having positive sentiment is 0.55 nothing - 7 Conditional probability of nothing having positive sentiment is 0.35 watch - 7 Conditional probability of watch having positive sentiment is 0.35 bad - 31 Conditional probability of bad having positive sentiment is 1.55 never - 6 Conditional probability of never having positive sentiment is 0.3 watching - 8 Conditional probability of watching having positive sentiment is 0.4 stupid - 12 Conditional probability of stupid having positive sentiment is 0.6 awful - 10 Conditional probability of awful having positive sentiment is 0.5 get - 7 Conditional probability of get having positive sentiment is 0.35 way - 8 Conditional probability of way having positive sentiment is 0.4 pretty - 6 Conditional probability of pretty having positive sentiment is 0.3 terrible - 8 Conditional probability of terrible having positive sentiment is 0.4 worse - 8 Conditional probability of worse having positive sentiment is 0.4 minutes - 6 Conditional probability of minutes having positive sentiment is 0.3 thing - 6 Conditional probability of 
thing having positive sentiment is 0.3 know - 6 Conditional probability of know having positive sentiment is 0.3 show - 10 Conditional probability of show having positive sentiment is 0.5 going - 6 Conditional probability of going having positive sentiment is 0.3 [13.333333333333334, 33.33333333333333, 20.0, 66.66666666666666, 80.0] Average = 42.666666666666664
# Keep the 'Reviews' column of train_data under its own name; it is handed to
# smooth() together with the full frame.
train_x = train_data['Reviews']
def _smoothed_table(word_counts, review_count, label):
    """Return add-one smoothed values for one class, printing each as it is computed."""
    # The denominator does not depend on the word, so compute it once.
    denominator = review_count + len(word_counts)
    table = {}
    for word, count in word_counts.items():
        table[word] = (count + 1) / denominator
        print(word, "-", count, " Conditional probability of ", word,
              "having " + label + " sentiment is ", table[word])
    return table


def smooth(train_data, train_x, *, positive_counts=None, negative_counts=None):
    """Compute add-one smoothed per-word conditional probabilities for each class.

    For every word the value is ``(count + 1) / (n_reviews + n_words)`` where
    ``n_reviews`` is the number of rows of ``train_data`` carrying that class
    label and ``n_words`` is the number of entries in that class's count table.
    Every computed value is also printed, positive words first.

    Args:
        train_data: DataFrame with a ``Sentiment`` column; rows equal to 1 are
            counted for the positive class and rows equal to 0 for the
            negative class.
        train_x: Unused; kept so existing ``smooth(train_data, train_x)``
            calls keep working.
        positive_counts: Optional mapping ``word -> count`` for the positive
            class. Defaults to the module-level ``positive_words``.
        negative_counts: Optional mapping ``word -> count`` for the negative
            class. Defaults to the module-level ``negative_words``.

    Returns:
        A ``(positive_conditional, negative_conditional)`` tuple of dicts
        mapping each word to its smoothed value.

    NOTE(review): the denominator counts reviews, not word occurrences, so if
    the count tables hold total occurrences a value can exceed 1 - confirm
    that this is the intended estimator.
    """
    # Fall back to the notebook-level frequency tables when none are passed in.
    if positive_counts is None:
        positive_counts = positive_words
    if negative_counts is None:
        negative_counts = negative_words

    sentiment = train_data["Sentiment"]
    positive_reviews = int((sentiment == 1).sum())
    negative_reviews = int((sentiment == 0).sum())

    positive_conditional = _smoothed_table(positive_counts, positive_reviews, "positive")
    # The negative table is reported as "negative sentiment"; it was previously
    # mislabelled as positive in the printed output.
    negative_conditional = _smoothed_table(negative_counts, negative_reviews, "negative")
    return positive_conditional, negative_conditional
# Build the smoothed per-word tables for both classes; smooth() returns the
# positive table first and the negative table second.
(positive_conditional,
 negative_conditional) = smooth(train_data, train_x)
best - 10 Conditional probability of best having positive sentiment is 0.051643192488262914 movie - 35 Conditional probability of movie having positive sentiment is 0.16901408450704225 good - 14 Conditional probability of good having positive sentiment is 0.07042253521126761 look - 6 Conditional probability of look having positive sentiment is 0.03286384976525822 film - 33 Conditional probability of film having positive sentiment is 0.1596244131455399 ever - 6 Conditional probability of ever having positive sentiment is 0.03286384976525822 10 - 9 Conditional probability of 10 having positive sentiment is 0.046948356807511735 interesting - 6 Conditional probability of interesting having positive sentiment is 0.03286384976525822 well - 12 Conditional probability of well having positive sentiment is 0.06103286384976526 time - 7 Conditional probability of time having positive sentiment is 0.03755868544600939 one - 11 Conditional probability of one having positive sentiment is 0.056338028169014086 worth - 7 Conditional probability of worth having positive sentiment is 0.03755868544600939 funny - 10 Conditional probability of funny having positive sentiment is 0.051643192488262914 characters - 9 Conditional probability of characters having positive sentiment is 0.046948356807511735 cast - 8 Conditional probability of cast having positive sentiment is 0.04225352112676056 way - 6 Conditional probability of way having positive sentiment is 0.03286384976525822 think - 7 Conditional probability of think having positive sentiment is 0.03755868544600939 really - 14 Conditional probability of really having positive sentiment is 0.07042253521126761 beautiful - 6 Conditional probability of beautiful having positive sentiment is 0.03286384976525822 great - 12 Conditional probability of great having positive sentiment is 0.06103286384976526 anyone - 8 Conditional probability of anyone having positive sentiment is 0.04225352112676056 see - 12 Conditional probability of see having 
positive sentiment is 0.06103286384976526 like - 6 Conditional probability of like having positive sentiment is 0.03286384976525822 love - 7 Conditional probability of love having positive sentiment is 0.03755868544600939 every - 6 Conditional probability of every having positive sentiment is 0.03286384976525822 movie - 50 Conditional probability of movie having positive sentiment is 0.16037735849056603 characters - 10 Conditional probability of characters having positive sentiment is 0.03459119496855346 even - 22 Conditional probability of even having positive sentiment is 0.07232704402515723 acting - 16 Conditional probability of acting having positive sentiment is 0.05345911949685535 plot - 16 Conditional probability of plot having positive sentiment is 0.05345911949685535 almost - 6 Conditional probability of almost having positive sentiment is 0.0220125786163522 little - 11 Conditional probability of little having positive sentiment is 0.03773584905660377 see - 12 Conditional probability of see having positive sentiment is 0.040880503144654086 film - 33 Conditional probability of film having positive sentiment is 0.1069182389937107 1 - 6 Conditional probability of 1 having positive sentiment is 0.0220125786163522 enough - 6 Conditional probability of enough having positive sentiment is 0.0220125786163522 films - 6 Conditional probability of films having positive sentiment is 0.0220125786163522 one - 10 Conditional probability of one having positive sentiment is 0.03459119496855346 waste - 7 Conditional probability of waste having positive sentiment is 0.025157232704402517 good - 10 Conditional probability of good having positive sentiment is 0.03459119496855346 time - 11 Conditional probability of time having positive sentiment is 0.03773584905660377 character - 6 Conditional probability of character having positive sentiment is 0.0220125786163522 make - 12 Conditional probability of make having positive sentiment is 0.040880503144654086 would - 14 Conditional 
probability of would having positive sentiment is 0.04716981132075472 also - 9 Conditional probability of also having positive sentiment is 0.031446540880503145 much - 13 Conditional probability of much having positive sentiment is 0.0440251572327044 like - 14 Conditional probability of like having positive sentiment is 0.04716981132075472 story - 6 Conditional probability of story having positive sentiment is 0.0220125786163522 real - 7 Conditional probability of real having positive sentiment is 0.025157232704402517 work - 10 Conditional probability of work having positive sentiment is 0.03459119496855346 scenes - 6 Conditional probability of scenes having positive sentiment is 0.0220125786163522 ever - 7 Conditional probability of ever having positive sentiment is 0.025157232704402517 seen - 7 Conditional probability of seen having positive sentiment is 0.025157232704402517 cast - 6 Conditional probability of cast having positive sentiment is 0.0220125786163522 10 - 7 Conditional probability of 10 having positive sentiment is 0.025157232704402517 annoying - 6 Conditional probability of annoying having positive sentiment is 0.0220125786163522 actors - 7 Conditional probability of actors having positive sentiment is 0.025157232704402517 well - 8 Conditional probability of well having positive sentiment is 0.02830188679245283 dialogue - 6 Conditional probability of dialogue having positive sentiment is 0.0220125786163522 really - 12 Conditional probability of really having positive sentiment is 0.040880503144654086 great - 6 Conditional probability of great having positive sentiment is 0.0220125786163522 things - 9 Conditional probability of things having positive sentiment is 0.031446540880503145 could - 11 Conditional probability of could having positive sentiment is 0.03773584905660377 writing - 8 Conditional probability of writing having positive sentiment is 0.02830188679245283 made - 9 Conditional probability of made having positive sentiment is 
0.031446540880503145 script - 11 Conditional probability of script having positive sentiment is 0.03773584905660377 nothing - 7 Conditional probability of nothing having positive sentiment is 0.025157232704402517 watch - 7 Conditional probability of watch having positive sentiment is 0.025157232704402517 bad - 31 Conditional probability of bad having positive sentiment is 0.10062893081761007 never - 6 Conditional probability of never having positive sentiment is 0.0220125786163522 watching - 8 Conditional probability of watching having positive sentiment is 0.02830188679245283 stupid - 12 Conditional probability of stupid having positive sentiment is 0.040880503144654086 awful - 10 Conditional probability of awful having positive sentiment is 0.03459119496855346 get - 7 Conditional probability of get having positive sentiment is 0.025157232704402517 way - 8 Conditional probability of way having positive sentiment is 0.02830188679245283 pretty - 6 Conditional probability of pretty having positive sentiment is 0.0220125786163522 terrible - 8 Conditional probability of terrible having positive sentiment is 0.02830188679245283 worse - 8 Conditional probability of worse having positive sentiment is 0.02830188679245283 minutes - 6 Conditional probability of minutes having positive sentiment is 0.0220125786163522 thing - 6 Conditional probability of thing having positive sentiment is 0.0220125786163522 know - 6 Conditional probability of know having positive sentiment is 0.0220125786163522 show - 10 Conditional probability of show having positive sentiment is 0.03459119496855346 going - 6 Conditional probability of going having positive sentiment is 0.0220125786163522
Now, computing the accuracy after smoothing
def prediction(data,sn):
    """Classify each review with the smoothed word tables and score the result.

    For every review the positive and negative class scores are accumulated
    from the class prior (``probab``) and the per-word conditional
    probabilities (``positive_conditional`` / ``negative_conditional``).
    A review is labelled 1 when the positive score is strictly greater than
    the negative score, otherwise 0.

    Scores are summed in log space rather than multiplied as raw
    probabilities, so long reviews cannot underflow to 0.0 and collapse the
    comparison. Words missing from a table fall back to a smoothed
    probability without being written into the table, so running a
    prediction never alters the trained model.

    Args:
        data: iterable of review strings.
        sn: the true labels for ``data``; must provide ``tolist()``.

    Returns:
        Whatever ``f1score(true_labels, predicted_labels)`` returns.
    """
    import math

    # Fallback probability for a word absent from a class table. These only
    # depend on the training data, so compute them once per call instead of
    # once per unseen word.
    unseen_pos = 1/(len(train_data.loc[train_data.Sentiment==1].Reviews) + len(positive_words))
    unseen_neg = 1/(len(train_data.loc[train_data.Sentiment==0].Reviews) + len(negative_words))

    def _log(p):
        # A zero probability maps to -inf, which keeps the same ordering the
        # plain product would have produced (a product of zero loses or ties).
        return math.log(p) if p > 0 else float('-inf')

    log_prior_pos = _log(probab[1])
    log_prior_neg = _log(probab[0])

    Sentiment_prediction = []
    for statement in data:
        pos_score = log_prior_pos
        neg_score = log_prior_neg
        # NOTE(review): split(' ') yields empty tokens for consecutive
        # spaces; kept as-is on the assumption that training tokenised the
        # same way — confirm.
        for word in statement.split(' '):
            word = word.lower()
            pos_score += _log(positive_conditional.get(word, unseen_pos))
            neg_score += _log(negative_conditional.get(word, unseen_neg))
        # Ties go to the negative class.
        Sentiment_prediction.append(1 if pos_score > neg_score else 0)
    return f1score(sn.tolist(),Sentiment_prediction)
# Score the classifier on the test split; the training and dev runs are
# left disabled.
# NOTE(review): prediction() returns the result of f1score(), while the
# printed label says "Accuracy" — confirm which metric is actually reported.
##print("Accuracy on training data: ", prediction(train_data['Reviews'],train_data['Sentiment']))
print("Accuracy on test data: ", prediction(test_data['Reviews'],test_data['Sentiment']))
#print("Accuracy on dev data: ", prediction(dev_data['Reviews'],dev_data['Sentiment']))
Accuracy on test data: 71.11111111111111
Finally, we have an accuracy of 71.11%, which can be further improved by removing stopwords.
Top words:
# Rank words that occur in both classes by how strongly they lean positive:
# (positive count - negative count) / overall count. Words that lean
# negative (score below zero) are left out.
top_words = {}
for word in wordlist:
    if word not in positive_words or word not in negative_words:
        continue
    score = (positive_words[word] - negative_words[word]) / wordlist[word]
    if score < 0:
        continue
    top_words[word] = score
top_words
{'10': 0.125, 'cast': 0.14285714285714285, 'film': 0.0, 'good': 0.16666666666666666, 'great': 0.3333333333333333, 'one': 0.047619047619047616, 'really': 0.07692307692307693, 'see': 0.0, 'well': 0.2}