Author: Yihan Bian
Association rule mining, at a basic level, involves the use of machine learning models to analyze data for patterns, or co-occurrences, in a database. It identifies frequent if-then associations, which themselves are the association rules. An association rule has two parts: an antecedent (if) and a consequent (then).
Most machine learning algorithms work with numeric datasets and hence tend to be mathematical. However, association rule mining is suitable for non-numeric, categorical data and requires just a little bit more than simple counting.
In this case association rule mining is the best choice, since we need to process the non-numeric text data (tweets) from Twitter.
Association rule mining is a procedure which aims to observe frequently occurring patterns, correlations, or associations from datasets found in various kinds of databases such as relational databases, transactional databases, and other forms of repositories.
import nltk
nltk.download('omw-1.4')
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from apyori import apriori
import networkx as nx
[nltk_data] Downloading package omw-1.4 to [nltk_data] /Users/nathanbian/nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
The following code is needed to read, clean, and convert the text into a format suitable for ARM
First, we find that the text data contains many emojis, which are not useful for this task, so we need to find and remove them.
There are also many uninformative tokens — such as usernames, hashtags, and URLs — that are not helpful for our task and need to be filtered out.
import re

# Compiled once at import time: re-building this large character class on
# every call is wasted work when cleaning thousands of tweets.
# (The original also listed the \U00002702-\U000027B0 range twice; the
# duplicate is removed — a character class is a set, so this is a no-op.)
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"  # box drawing / CJK-adjacent symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"
    "\u3030"
    "]+",
    re.UNICODE,
)


def remove_emojis(data):
    """Return *data* with every run of emoji/pictograph characters removed.

    Parameters
    ----------
    data : str
        Raw tweet text.

    Returns
    -------
    str
        The input string with all characters matching the emoji pattern
        stripped out (surrounding whitespace is left as-is).
    """
    return _EMOJI_PATTERN.sub('', data)
import pandas as pd  # NOTE(review): pandas is already imported above; redundant but harmless
# Raw scraped tweets; only the free-text `text` column is used downstream.
input_data = '../../data/00-raw-data/Metaverse_tweets.csv'
df = pd.read_csv(input_data)
# NOTE(review): empty tweet cells are read as NaN floats, which is why the
# cleaning loop below guards against non-string entries — TODO confirm.
text_list = list(df.text)
def remove_word(text, char):
    """Return *text* with every whitespace-separated token that starts
    with *char* removed.

    Used below to strip hashtags ('#'), mentions ('@'), cashtags ('$'),
    URLs ('http') and hex addresses ('0x') from tweets.  *char* may be a
    multi-character prefix, since ``str.startswith`` accepts one.

    Parameters
    ----------
    text : str
        Input sentence.
    char : str
        Prefix identifying the tokens to drop.

    Returns
    -------
    str
        The surviving tokens re-joined with single spaces.
    """
    # The original assigned `my_str = text` and `char = char` (dead code);
    # the filter itself is unchanged.
    return " ".join(
        word for word in text.split() if not word.startswith(char)
    )
# Clean up to the first 10,000 tweets: strip emojis, then tokens that look
# like hashtags, mentions, cashtags, URLs, or hex addresses, and keep only
# results long enough to carry meaning.
new_texts = []
for i, text in enumerate(text_list):
    if i >= 10000:  # cap the work at the first 10,000 tweets
        break
    try:
        cleaned = remove_emojis(text)
        for prefix in ('#', '@', '$', 'http', '0x'):
            cleaned = remove_word(cleaned, prefix)
    except (TypeError, AttributeError):
        # Non-string rows (e.g. NaN read from the CSV) — skip them rather
        # than abort the pass.  (The original used a bare `except:`, which
        # would also have hidden genuine bugs.)
        continue
    if len(cleaned) > 20:  # drop fragments too short to be useful
        new_texts.append(cleaned)

# De-duplicate retweets/copies.  NOTE: going through set() makes the order
# of the surviving tweets arbitrary, exactly as in the original code.
new_texts = list(set(new_texts))
text_data = "\n".join(new_texts)
with open('TEXTDATA.txt', 'w') as f:
    f.write(text_data)
#USER PARAM
input_path = 'TEXTDATA.txt'
compute_sentiment = True
sentiment = [] #average sentiment of each chunk of text (filled by read_and_clean)
ave_window_size = 250 #size of scanning window for moving average
#OUTPUT FILE
output='transactions.txt'
# read_and_clean() appends to this file, so remove any stale copy first.
if os.path.exists(output):
    os.remove(output)
#INITIALIZE
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()  # NOTE(review): instantiated but never used below
sia = SentimentIntensityAnalyzer()
#ADD MORE
# NOTE(review): this rebinds the name `stopwords`, shadowing the imported
# nltk.corpus.stopwords module; read_and_clean() depends on this list, so
# the name cannot be changed here without changing it there too.
stopwords = stopwords.words('english')
# Extra corpus-specific noise words (e.g. 'wa' is the lemmatized 'was').
add=['mr','mrs','wa','dr','said','back','could','one','looked','like','know','around','dont']
for sp in add:
    stopwords.append(sp)
def read_and_clean(path, START=0, STOP=-1):
    """Read a text file, clean and lemmatize it, and return ARM transactions.

    The file at *path* is lower-cased, stripped of apostrophes and of any
    non-printable characters, split into sentences, and each sentence is
    reduced to comma-separated, lemmatized, stopword-free tokens.

    Side effects: sentences with more than two tokens are appended to the
    module-level `output` file; when `compute_sentiment` is true, a VADER
    score row [neg, neu, pos, compound] per sentence is appended to the
    module-level `sentiment` list (converted to an ndarray at the end).

    Parameters
    ----------
    path : str
        Input text file (one cleaned tweet per line).
    START, STOP : int
        Slice applied to the file's lines before processing.

    Returns
    -------
    list[list[str]]
        One token list per sentence — the ARM "transactions".
    """
    global sentiment
    # Read the whole file as one big lower-case string.
    with open(path, 'rt') as file:
        text = file.read().lower()
    # "wasn't" --> "wasnt", then flatten the selected lines into one string.
    text = text.replace("'", '')
    lines = text.splitlines()[START:STOP]
    text = ''.join(' ' + line for line in lines)
    # Keep only characters in string.printable.
    printable = set(string.printable)
    text = ''.join(char for char in text if char in printable)
    # Break into chunks (sentences).
    sentences = nltk.tokenize.sent_tokenize(text)
    print("NUMBER OF SENTENCES FOUND:", len(sentences))
    # Clean and lemmatize.
    # BUG FIX: the original keep-list stopped at 'y', so every 'z' was
    # replaced by a space ("amazing" -> "amaing" — visible in the mined
    # rules).  'z' is now included.
    keep = '0123456789abcdefghijklmnopqrstuvwxyz'
    new_sentences = []
    vocabulary = []
    # Open the transactions file once instead of re-opening per sentence.
    out_file = open(output, "a")
    try:
        for sentence in sentences:
            new_sentence = ''
            for word in sentence.split():
                # Replace every disallowed character with a space ...
                word = ''.join(c if c in keep else ' ' for c in word)
                # ... lemmatize, then squeeze the spaces back out.
                new_word = lemmatizer.lemmatize(word).replace(' ', '')
                # Rebuild the sentence as a comma-separated token list.
                if new_word not in stopwords:
                    if new_sentence == '':
                        new_sentence = new_word
                    else:
                        new_sentence = new_sentence + ',' + new_word
                    if new_word not in vocabulary:
                        vocabulary.append(new_word)
            # Save (list of lists).
            new_sentences.append(new_sentence.split(","))
            if compute_sentiment:
                # VADER sentiment on the cleaned, space-joined sentence.
                score = sia.polarity_scores(new_sentence.replace(',', ' '))
                sentiment.append([score['neg'], score['neu'], score['pos'], score['compound']])
            # Persist sentences with at least three tokens as transactions.
            if len(new_sentence.split(',')) > 2:
                out_file.write(new_sentence + "\n")
    finally:
        out_file.close()
    sentiment = np.array(sentiment)
    print("TOTAL AVERAGE SENTEMENT:", np.mean(sentiment, axis=0))
    print("VOCAB LENGTH", len(vocabulary))
    return new_sentences
# Skip the first and last 400 lines (edge noise) and build the transactions.
transactions=read_and_clean(input_path,400,-400)
print(transactions[0:5])
NUMBER OF SENTENCES FOUND: 3879 TOTAL AVERAGE SENTEMENT: [0.03992137 0.66198144 0.29190797 0.34684725] VOCAB LENGTH 6664 [['maticverse', 'next', 'listing', 'playtoearn', 'category', 'binance', 'p2e', 'maticverse', 'amp', 'enjoy', 'gain', 'amp', 'free', 'earnings', 'wonderful', 'project', 'project', 'good', 'project', 'ha', 'lot', 'attractionsgo', 'moon', 'future', 'meta', 'already', 'miss', 'trip', 'moon'], ['buy', 'httpstcolrcshhcsyh', 'youre', 'looking', 'art', 'also', 'legit', 'utility', 'def', 'check', 'project', 'connecting', 'real', 'world', 'let', 'go'], ['pedal', 'metal', 'moving', 'world', 'new', 'exchange', 'month'], ['target', 'got', 'higher'], ['double', 'digit', 'coming', 'smart', 'man', 'miss']]
def moving_ave(y, w=100):
    """Smooth the 1-D signal *y* with a width-*w* moving average.

    Parameters
    ----------
    y : array_like
        Input signal.
    w : int
        Window width (number of samples averaged per output point).

    Returns
    -------
    numpy.ndarray
        Same length as *y* (``np.convolve`` 'same' mode); edge values
        come from partial overlap of the window with the signal.
    """
    kernel = np.full(w, 1.0 / w)
    return np.convolve(y, kernel, 'same')
# INSERT CODE TO VISUALIZE THE SENTIMENT ANALYSIS AS A TIME-SERIES (SEE PLOT FOR AN EXAMPLE)
# Smooth each sentiment channel with a moving average, then z-score it so
# the curves share a common scale on one axis.
neg = moving_ave(sentiment[:,0], ave_window_size)
neg = (neg - np.mean(neg))/np.std(neg)
neu = moving_ave(sentiment[:,1], ave_window_size)  # computed but not plotted below
neu = (neu - np.mean(neu))/np.std(neu)
pos = moving_ave(sentiment[:,2], ave_window_size)
pos = (pos - np.mean(pos))/np.std(pos)
compound = moving_ave(sentiment[:,3], ave_window_size)  # computed but not plotted below
compound = (compound - np.mean(compound))/np.std(compound)
# NOTE(review): np.arange(len(sentiment)) is the usual x-axis here;
# linspace(0, len, len) gives a step slightly larger than 1 — harmless.
indx = np.linspace(0, len(sentiment), len(sentiment))
plt.plot(indx, neg, label='negative')
plt.plot(indx, pos, label='positive')
plt.xlabel("text chuncks: progression of text")
plt.ylabel("sentiment")
plt.legend(loc="upper left")
<matplotlib.legend.Legend at 0x7f8d18b452a0>
Here is the graph of the sentiment analysis of each sentence of the whole text data.
We can see that these tweets show a mix of attitudes toward the topic we want to dig into, which is a good sign: in a real-world situation we expect people to hold both positive and negative attitudes toward a newly created thing.
We define a series of helping function to do the association rule mining analysis more efficiently.
# INSERT CODE TO RE-FORMAT THE APRIORI OUTPUT INTO A PANDAS DATA-FRAME WITH COLUMNS "rhs","lhs","supp","conf","supp x conf","lift"
def reformat_results(results):
    """Flatten apyori `apriori` output into a tidy DataFrame.

    Each apyori record is laid out as ``(itemset, support,
    ordered_statistics)`` and every ordered statistic as
    ``(items_base, items_add, confidence, lift)``.

    The original version set ``supp`` in an ``if j == 1`` branch placed
    textually *after* the branch that used it — correct only because of
    runtime iteration order.  This version reads the support first.

    Parameters
    ----------
    results : list
        Output of ``list(apyori.apriori(...))``.

    Returns
    -------
    pandas.DataFrame
        Columns "rhs", "lhs", "supp", "conf", "supp x conf", "lift" —
        one row per ordered statistic with a non-empty items_base.
    """
    keep = []
    for record in results:
        record = list(record)
        # Field 1 is the rule's support; fields 2+ hold statistic lists.
        supp = record[1]
        for stats in record[2:]:
            for stat in stats:
                # Skip statistics with an empty antecedent set.
                if len(stat[0]) == 0:
                    continue
                rhs = list(stat[0])
                lhs = list(stat[1])
                conf = float(stat[2])
                lift = float(stat[3])
                keep.append([rhs, lhs, supp, conf, supp * conf, lift])
    return pd.DataFrame(data=keep, columns=["rhs", "lhs", "supp", "conf", "supp x conf", "lift"])
def convert_to_network(df):
    """Build a directed networkx graph from a rules DataFrame.

    Column 0 ("rhs") of each row becomes the edge source, column 1
    ("lhs") the target, and column 3 ("conf") the edge weight; item
    lists are joined with underscores to form node labels.  The frame
    is printed first, exactly as before, for notebook inspection.
    """
    print(df)
    graph = nx.DiGraph()  # directed
    for _, row in df.iterrows():
        source = "_".join(row.iloc[0])
        target = "_".join(row.iloc[1])
        weight = row.iloc[3]
        for node in (source, target):
            if node not in graph.nodes:
                graph.add_node(node)
        # First rule between a pair wins; duplicates keep the old weight.
        if (source, target) not in graph.edges:
            graph.add_edge(source, target, weight=weight)
    return graph
def plot_network(G):
    """Draw the rule graph; edge width and color both encode rule confidence."""
    #SPECIFY X-Y POSITIONS FOR PLOTTING (random layout — no structure implied)
    pos=nx.random_layout(G)
    #GENERATE PLOT
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 15)
    #assign colors based on attributes
    # Edge widths are the raw confidence weights set in convert_to_network().
    weights_e = [G[u][v]['weight'] for u,v in G.edges()]
    #SAMPLE CMAP FOR COLORS
    # NOTE(review): plt.cm.get_cmap is deprecated in recent matplotlib
    # (use plt.get_cmap / matplotlib.colormaps) — confirm installed version.
    cmap=plt.cm.get_cmap('Blues')
    # Confidence scaled by 10 so typical values (~0.1-1.0) span the colormap.
    colors_e = [cmap(G[u][v]['weight']*10) for u,v in G.edges()]
    #PLOT
    nx.draw(
        G,
        edgecolors="black",
        edge_color=colors_e,
        node_size=2000,
        linewidths=2,
        font_size=8,
        font_color="white",
        font_weight="bold",
        width=weights_e,
        with_labels=True,
        pos=pos,
        ax=ax
    )
    ax.set(title='tweets')
    plt.show()
Now that we have both the data and the settings, with the help of all the helper functions defined above we can start training the association rule mining model.
# INSERT CODE TO TRAIN THE ARM MODEL USING THE "apriori" PACKAGE
# Peek at the transaction table (ragged rows are padded with None by pandas).
print("Transactions:", pd.DataFrame(transactions).head(6))
# NOTE(review): apyori's apriori() documents min_support, min_confidence,
# min_lift and max_length; `min_length` is not among them and is likely
# ignored silently — confirm against the installed apyori version.
results = list(apriori(transactions, min_support=0.02, min_confidence=0.1, min_length=2, max_length=5))
print(len(results))
Transactions: 0 1 2 3 4 5 \ 0 maticverse next listing playtoearn category binance 1 buy httpstcolrcshhcsyh youre looking art also 2 pedal metal moving world new exchange 3 target got higher None None None 4 double digit coming smart man miss 5 good project good job good project 6 7 8 9 ... 76 77 78 79 80 81 \ 0 p2e maticverse amp enjoy ... None None None None None None 1 legit utility def check ... None None None None None None 2 month None None None ... None None None None None None 3 None None None None ... None None None None None None 4 None None None None ... None None None None None None 5 None None None None ... None None None None None None 82 83 84 85 0 None None None None 1 None None None None 2 None None None None 3 None None None None 4 None None None None 5 None None None None [6 rows x 86 columns] 15
This is part of the full transaction data, which was generated from the text data we cleaned earlier.
Using the above reformatted transaction data, we can generate the corresponding association rule mining model and we can plot this model in a directed graph format which should be better to be viewed
# INSERT CODE TO PLOT THE RESULTS AS A NETWORK-X OBJECT
# Flatten the apyori records into a DataFrame, build the directed rule
# graph, and draw it.
pd_results = reformat_results(results)
G = convert_to_network(pd_results)
plot_network(G)
rhs lhs supp conf supp x conf lift 0 [] [holder] 0.020624 0.121581 0.002507 3.930091 1 [holder] [] 0.020624 0.666667 0.013749 3.930091 2 [] [metaverse] 0.023460 0.138298 0.003244 1.800193 3 [metaverse] [] 0.023460 0.305369 0.007164 1.800193 4 [] [project] 0.044857 0.264438 0.011862 1.164306 5 [project] [] 0.044857 0.197503 0.008859 1.164306 6 [amaing] [project] 0.020366 0.699115 0.014238 3.078169 7 [best] [project] 0.032483 0.633166 0.020567 2.787798 8 [project] [best] 0.032483 0.143019 0.004646 2.787798 9 [cap] [market] 0.024749 0.905660 0.022414 26.817226 10 [market] [cap] 0.024749 0.732824 0.018136 26.817226 11 [future] [project] 0.042279 0.585714 0.024763 2.578871 12 [project] [future] 0.042279 0.186152 0.007870 2.578871 13 [good] [project] 0.048982 0.708955 0.034726 3.121495 14 [project] [good] 0.048982 0.215664 0.010564 3.121495 15 [great] [project] 0.048208 0.658451 0.031743 2.899126 16 [project] [great] 0.048208 0.212259 0.010233 2.899126 17 [ha] [project] 0.030162 0.529412 0.015968 2.330974 18 [project] [ha] 0.030162 0.132804 0.004006 2.330974 19 [join] [project] 0.020108 0.357798 0.007195 1.575368 20 [metaverse] [project] 0.021655 0.281879 0.006104 1.241100 21 [project] [team] 0.042021 0.185017 0.007775 2.859287 22 [team] [project] 0.042021 0.649402 0.027289 2.859287
By observing this graph and the table above we can know a lot of information about these text data.
These text data were gathered from tweets about 'metaverse', a new tech term popularized by the large technology company Meta. We can now see that most people who tweet about this topic regard the 'metaverse' as a project, and most of them hold a positive attitude toward it, since several rules in the table show that the words 'good', 'best', and 'great' are highly associated with the word 'project'. It also appears that this project involves a market in which holders can buy and sell something.