Author: Yihan Bian
Association rule mining, at a basic level, involves the use of machine learning models to analyze data for patterns, or co-occurrences, in a database. It identifies frequent if-then associations, which themselves are the association rules. An association rule has two parts: an antecedent (if) and a consequent (then).
Most machine learning algorithms work with numeric datasets and hence tend to be mathematical. However, association rule mining is suitable for non-numeric, categorical data and requires just a little bit more than simple counting.
In this case association rule mining is the best choice, since we need to process the non-numeric text data (tweets) from Twitter.
Association rule mining is a procedure which aims to observe frequently occurring patterns, correlations, or associations from datasets found in various kinds of databases such as relational databases, transactional databases, and other forms of repositories.
import nltk
nltk.download('omw-1.4')
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from apyori import apriori
import networkx as nx
[nltk_data] Downloading package omw-1.4 to [nltk_data] /Users/nathanbian/nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
The following code is needed to read, clean, and convert the text into a format suitable for ARM
First, we find that the text data contains many emojis, which are not useful for this task, so we need to find and remove them.
There are also many uninformative tokens — such as usernames, hashtags, and URLs — that are not helpful for our task and need to be filtered out.
import re

# Compiled once at import time: re-building this large character class on
# every call is wasted work when cleaning thousands of tweets.
# (The original also listed the \U00002702-\U000027B0 range twice; the
# duplicate is removed — a character class is a set, so this is a no-op.)
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"  # box drawing / CJK-adjacent symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"
    "\u3030"
    "]+",
    re.UNICODE,
)


def remove_emojis(data):
    """Return *data* with every run of emoji/pictograph characters removed.

    Parameters
    ----------
    data : str
        Raw tweet text.

    Returns
    -------
    str
        The input string with all characters matching the emoji pattern
        stripped out (surrounding whitespace is left as-is).
    """
    return _EMOJI_PATTERN.sub('', data)
import pandas as pd  # NOTE(review): pandas is already imported above; redundant but harmless
# Raw scraped tweets; only the free-text `text` column is used downstream.
input_data = '../../data/00-raw-data/Metaverse_tweets.csv'
df = pd.read_csv(input_data)
# NOTE(review): empty tweet cells are read as NaN floats, which is why the
# cleaning loop below guards against non-string entries — TODO confirm.
text_list = list(df.text)
def remove_word(text, char):
    """Return *text* with every whitespace-separated token that starts
    with *char* removed.

    Used below to strip hashtags ('#'), mentions ('@'), cashtags ('$'),
    URLs ('http') and hex addresses ('0x') from tweets.  *char* may be a
    multi-character prefix, since ``str.startswith`` accepts one.

    Parameters
    ----------
    text : str
        Input sentence.
    char : str
        Prefix identifying the tokens to drop.

    Returns
    -------
    str
        The surviving tokens re-joined with single spaces.
    """
    # The original assigned `my_str = text` and `char = char` (dead code);
    # the filter itself is unchanged.
    return " ".join(
        word for word in text.split() if not word.startswith(char)
    )
# Clean up to the first 10,000 tweets: strip emojis, then tokens that look
# like hashtags, mentions, cashtags, URLs, or hex addresses, and keep only
# results long enough to carry meaning.
new_texts = []
for i, text in enumerate(text_list):
    if i >= 10000:  # cap the work at the first 10,000 tweets
        break
    try:
        cleaned = remove_emojis(text)
        for prefix in ('#', '@', '$', 'http', '0x'):
            cleaned = remove_word(cleaned, prefix)
    except (TypeError, AttributeError):
        # Non-string rows (e.g. NaN read from the CSV) — skip them rather
        # than abort the pass.  (The original used a bare `except:`, which
        # would also have hidden genuine bugs.)
        continue
    if len(cleaned) > 20:  # drop fragments too short to be useful
        new_texts.append(cleaned)

# De-duplicate retweets/copies.  NOTE: going through set() makes the order
# of the surviving tweets arbitrary, exactly as in the original code.
new_texts = list(set(new_texts))
text_data = "\n".join(new_texts)
with open('TEXTDATA.txt', 'w') as f:
    f.write(text_data)
#USER PARAM
input_path = 'TEXTDATA.txt'
compute_sentiment = True
sentiment = [] #average sentiment of each chunk of text (filled by read_and_clean)
ave_window_size = 250 #size of scanning window for moving average
#OUTPUT FILE
output='transactions.txt'
# read_and_clean() appends to this file, so remove any stale copy first.
if os.path.exists(output):
    os.remove(output)
#INITIALIZE
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()  # NOTE(review): instantiated but never used below
sia = SentimentIntensityAnalyzer()
#ADD MORE
# NOTE(review): this rebinds the name `stopwords`, shadowing the imported
# nltk.corpus.stopwords module; read_and_clean() depends on this list, so
# the name cannot be changed here without changing it there too.
stopwords = stopwords.words('english')
# Extra corpus-specific noise words (e.g. 'wa' is the lemmatized 'was').
add=['mr','mrs','wa','dr','said','back','could','one','looked','like','know','around','dont']
for sp in add:
    stopwords.append(sp)
def read_and_clean(path, START=0, STOP=-1):
    """Read a text file, clean and lemmatize it, and return ARM transactions.

    The file at *path* is lower-cased, stripped of apostrophes and of any
    non-printable characters, split into sentences, and each sentence is
    reduced to comma-separated, lemmatized, stopword-free tokens.

    Side effects: sentences with more than two tokens are appended to the
    module-level `output` file; when `compute_sentiment` is true, a VADER
    score row [neg, neu, pos, compound] per sentence is appended to the
    module-level `sentiment` list (converted to an ndarray at the end).

    Parameters
    ----------
    path : str
        Input text file (one cleaned tweet per line).
    START, STOP : int
        Slice applied to the file's lines before processing.

    Returns
    -------
    list[list[str]]
        One token list per sentence — the ARM "transactions".
    """
    global sentiment
    # Read the whole file as one big lower-case string.
    with open(path, 'rt') as file:
        text = file.read().lower()
    # "wasn't" --> "wasnt", then flatten the selected lines into one string.
    text = text.replace("'", '')
    lines = text.splitlines()[START:STOP]
    text = ''.join(' ' + line for line in lines)
    # Keep only characters in string.printable.
    printable = set(string.printable)
    text = ''.join(char for char in text if char in printable)
    # Break into chunks (sentences).
    sentences = nltk.tokenize.sent_tokenize(text)
    print("NUMBER OF SENTENCES FOUND:", len(sentences))
    # Clean and lemmatize.
    # BUG FIX: the original keep-list stopped at 'y', so every 'z' was
    # replaced by a space ("amazing" -> "amaing" — visible in the mined
    # rules).  'z' is now included.
    keep = '0123456789abcdefghijklmnopqrstuvwxyz'
    new_sentences = []
    vocabulary = []
    # Open the transactions file once instead of re-opening per sentence.
    out_file = open(output, "a")
    try:
        for sentence in sentences:
            new_sentence = ''
            for word in sentence.split():
                # Replace every disallowed character with a space ...
                word = ''.join(c if c in keep else ' ' for c in word)
                # ... lemmatize, then squeeze the spaces back out.
                new_word = lemmatizer.lemmatize(word).replace(' ', '')
                # Rebuild the sentence as a comma-separated token list.
                if new_word not in stopwords:
                    if new_sentence == '':
                        new_sentence = new_word
                    else:
                        new_sentence = new_sentence + ',' + new_word
                    if new_word not in vocabulary:
                        vocabulary.append(new_word)
            # Save (list of lists).
            new_sentences.append(new_sentence.split(","))
            if compute_sentiment:
                # VADER sentiment on the cleaned, space-joined sentence.
                score = sia.polarity_scores(new_sentence.replace(',', ' '))
                sentiment.append([score['neg'], score['neu'], score['pos'], score['compound']])
            # Persist sentences with at least three tokens as transactions.
            if len(new_sentence.split(',')) > 2:
                out_file.write(new_sentence + "\n")
    finally:
        out_file.close()
    sentiment = np.array(sentiment)
    print("TOTAL AVERAGE SENTEMENT:", np.mean(sentiment, axis=0))
    print("VOCAB LENGTH", len(vocabulary))
    return new_sentences
# Skip the first and last 400 lines (edge noise) and build the transactions.
transactions=read_and_clean(input_path,400,-400)
print(transactions[0:5])
NUMBER OF SENTENCES FOUND: 3879 TOTAL AVERAGE SENTEMENT: [0.03992137 0.66198144 0.29190797 0.34684725] VOCAB LENGTH 6664 [['maticverse', 'next', 'listing', 'playtoearn', 'category', 'binance', 'p2e', 'maticverse', 'amp', 'enjoy', 'gain', 'amp', 'free', 'earnings', 'wonderful', 'project', 'project', 'good', 'project', 'ha', 'lot', 'attractionsgo', 'moon', 'future', 'meta', 'already', 'miss', 'trip', 'moon'], ['buy', 'httpstcolrcshhcsyh', 'youre', 'looking', 'art', 'also', 'legit', 'utility', 'def', 'check', 'project', 'connecting', 'real', 'world', 'let', 'go'], ['pedal', 'metal', 'moving', 'world', 'new', 'exchange', 'month'], ['target', 'got', 'higher'], ['double', 'digit', 'coming', 'smart', 'man', 'miss']]
def moving_ave(y, w=100):
    """Smooth the 1-D signal *y* with a width-*w* moving average.

    Parameters
    ----------
    y : array_like
        Input signal.
    w : int
        Window width (number of samples averaged per output point).

    Returns
    -------
    numpy.ndarray
        Same length as *y* (``np.convolve`` 'same' mode); edge values
        come from partial overlap of the window with the signal.
    """
    kernel = np.full(w, 1.0 / w)
    return np.convolve(y, kernel, 'same')
# INSERT CODE TO VISUALIZE THE SENTIMENT ANALYSIS AS A TIME-SERIES (SEE PLOT FOR AN EXAMPLE)
# Smooth each sentiment channel with a moving average, then z-score it so
# the curves share a common scale on one axis.
neg = moving_ave(sentiment[:,0], ave_window_size)
neg = (neg - np.mean(neg))/np.std(neg)
neu = moving_ave(sentiment[:,1], ave_window_size)  # computed but not plotted below
neu = (neu - np.mean(neu))/np.std(neu)
pos = moving_ave(sentiment[:,2], ave_window_size)
pos = (pos - np.mean(pos))/np.std(pos)
compound = moving_ave(sentiment[:,3], ave_window_size)  # computed but not plotted below
compound = (compound - np.mean(compound))/np.std(compound)
# NOTE(review): np.arange(len(sentiment)) is the usual x-axis here;
# linspace(0, len, len) gives a step slightly larger than 1 — harmless.
indx = np.linspace(0, len(sentiment), len(sentiment))
plt.plot(indx, neg, label='negative')
plt.plot(indx, pos, label='positive')
plt.xlabel("text chuncks: progression of text")
plt.ylabel("sentiment")
plt.legend(loc="upper left")
<matplotlib.legend.Legend at 0x7f8d18b452a0>
Here is the graph of the sentiment analysis of each sentence of the whole text data.
We can see that these tweets show a mix of attitudes toward the topic we want to dig into, which is a good sign: in a real-world situation we expect people to hold both positive and negative attitudes toward a newly created thing.
We define a series of helping function to do the association rule mining analysis more efficiently.
# INSERT CODE TO RE-FORMAT THE APRIORI OUTPUT INTO A PANDAS DATA-FRAME WITH COLUMNS "rhs","lhs","supp","conf","supp x conf","lift"
def reformat_results(results):
    """Flatten apyori `apriori` output into a tidy DataFrame.

    Each apyori record is laid out as ``(itemset, support,
    ordered_statistics)`` and every ordered statistic as
    ``(items_base, items_add, confidence, lift)``.

    The original version set ``supp`` in an ``if j == 1`` branch placed
    textually *after* the branch that used it — correct only because of
    runtime iteration order.  This version reads the support first.

    Parameters
    ----------
    results : list
        Output of ``list(apyori.apriori(...))``.

    Returns
    -------
    pandas.DataFrame
        Columns "rhs", "lhs", "supp", "conf", "supp x conf", "lift" —
        one row per ordered statistic with a non-empty items_base.
    """
    keep = []
    for record in results:
        record = list(record)
        # Field 1 is the rule's support; fields 2+ hold statistic lists.
        supp = record[1]
        for stats in record[2:]:
            for stat in stats:
                # Skip statistics with an empty antecedent set.
                if len(stat[0]) == 0:
                    continue
                rhs = list(stat[0])
                lhs = list(stat[1])
                conf = float(stat[2])
                lift = float(stat[3])
                keep.append([rhs, lhs, supp, conf, supp * conf, lift])
    return pd.DataFrame(data=keep, columns=["rhs", "lhs", "supp", "conf", "supp x conf", "lift"])
def convert_to_network(df):
    """Build a directed networkx graph from a rules DataFrame.

    Column 0 ("rhs") of each row becomes the edge source, column 1
    ("lhs") the target, and column 3 ("conf") the edge weight; item
    lists are joined with underscores to form node labels.  The frame
    is printed first, exactly as before, for notebook inspection.
    """
    print(df)
    graph = nx.DiGraph()  # directed
    for _, row in df.iterrows():
        source = "_".join(row.iloc[0])
        target = "_".join(row.iloc[1])
        weight = row.iloc[3]
        for node in (source, target):
            if node not in graph.nodes:
                graph.add_node(node)
        # First rule between a pair wins; duplicates keep the old weight.
        if (source, target) not in graph.edges:
            graph.add_edge(source, target, weight=weight)
    return graph
def plot_network(G):
    """Draw the rule graph; edge width and color both encode rule confidence."""
    #SPECIFY X-Y POSITIONS FOR PLOTTING (random layout — no structure implied)
    pos=nx.random_layout(G)
    #GENERATE PLOT
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 15)
    #assign colors based on attributes
    # Edge widths are the raw confidence weights set in convert_to_network().
    weights_e = [G[u][v]['weight'] for u,v in G.edges()]
    #SAMPLE CMAP FOR COLORS
    # NOTE(review): plt.cm.get_cmap is deprecated in recent matplotlib
    # (use plt.get_cmap / matplotlib.colormaps) — confirm installed version.
    cmap=plt.cm.get_cmap('Blues')
    # Confidence scaled by 10 so typical values (~0.1-1.0) span the colormap.
    colors_e = [cmap(G[u][v]['weight']*10) for u,v in G.edges()]
    #PLOT
    nx.draw(
        G,
        edgecolors="black",
        edge_color=colors_e,
        node_size=2000,
        linewidths=2,
        font_size=8,
        font_color="white",
        font_weight="bold",
        width=weights_e,
        with_labels=True,
        pos=pos,
        ax=ax
    )
    ax.set(title='tweets')
    plt.show()
Now that we have both the data and the settings, with the help of all the helper functions defined above we can start training the association rule mining model.
# INSERT CODE TO TRAIN THE ARM MODEL USING THE "apriori" PACKAGE
# Peek at the transaction table (ragged rows are padded with None by pandas).
print("Transactions:", pd.DataFrame(transactions).head(6))
# NOTE(review): apyori's apriori() documents min_support, min_confidence,
# min_lift and max_length; `min_length` is not among them and is likely
# ignored silently — confirm against the installed apyori version.
results = list(apriori(transactions, min_support=0.02, min_confidence=0.1, min_length=2, max_length=5))
print(len(results))
Transactions: 0 1 2 3 4 5 \ 0 maticverse next listing playtoearn category binance 1 buy httpstcolrcshhcsyh youre looking art also 2 pedal metal moving world new exchange 3 target got higher None None None 4 double digit coming smart man miss 5 good project good job good project 6 7 8 9 ... 76 77 78 79 80 81 \ 0 p2e maticverse amp enjoy ... None None None None None None 1 legit utility def check ... None None None None None None 2 month None None None ... None None None None None None 3 None None None None ... None None None None None None 4 None None None None ... None None None None None None 5 None None None None ... None None None None None None 82 83 84 85 0 None None None None 1 None None None None 2 None None None None 3 None None None None 4 None None None None 5 None None None None [6 rows x 86 columns] 15
This is part of the full transaction data, which was generated from the text data we cleaned earlier.
Using the above reformatted transaction data, we can generate the corresponding association rule mining model and we can plot this model in a directed graph format which should be better to be viewed
# INSERT CODE TO PLOT THE RESULTS AS A NETWORK-X OBJECT
# Flatten the apyori records into a DataFrame, build the directed rule
# graph, and draw it.
pd_results = reformat_results(results)
G = convert_to_network(pd_results)
plot_network(G)
rhs lhs supp conf supp x conf lift 0 [] [holder] 0.020624 0.121581 0.002507 3.930091 1 [holder] [] 0.020624 0.666667 0.013749 3.930091 2 [] [metaverse] 0.023460 0.138298 0.003244 1.800193 3 [metaverse] [] 0.023460 0.305369 0.007164 1.800193 4 [] [project] 0.044857 0.264438 0.011862 1.164306 5 [project] [] 0.044857 0.197503 0.008859 1.164306 6 [amaing] [project] 0.020366 0.699115 0.014238 3.078169 7 [best] [project] 0.032483 0.633166 0.020567 2.787798 8 [project] [best] 0.032483 0.143019 0.004646 2.787798 9 [cap] [market] 0.024749 0.905660 0.022414 26.817226 10 [market] [cap] 0.024749 0.732824 0.018136 26.817226 11 [future] [project] 0.042279 0.585714 0.024763 2.578871 12 [project] [future] 0.042279 0.186152 0.007870 2.578871 13 [good] [project] 0.048982 0.708955 0.034726 3.121495 14 [project] [good] 0.048982 0.215664 0.010564 3.121495 15 [great] [project] 0.048208 0.658451 0.031743 2.899126 16 [project] [great] 0.048208 0.212259 0.010233 2.899126 17 [ha] [project] 0.030162 0.529412 0.015968 2.330974 18 [project] [ha] 0.030162 0.132804 0.004006 2.330974 19 [join] [project] 0.020108 0.357798 0.007195 1.575368 20 [metaverse] [project] 0.021655 0.281879 0.006104 1.241100 21 [project] [team] 0.042021 0.185017 0.007775 2.859287 22 [team] [project] 0.042021 0.649402 0.027289 2.859287
By observing this graph and the table above we can know a lot of information about these text data.
These text data were gathered from tweets about 'metaverse', a new tech term popularized by the large technology company Meta. We can now see that most people who tweet about this topic regard the 'metaverse' as a project, and most of them hold a positive attitude toward it, since several rules in the table show that the words 'good', 'best', and 'great' are highly associated with the word 'project'. It also appears that this project involves a market in which holders can buy and sell something.