TFIDF keyword extractor deployed using Streamlit

TF-IDF stands for "Term Frequency–Inverse Document Frequency". It is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus. The TF-IDF value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word.

We use the scikit-learn library to compute TF-IDF scores on our training data.

- For this demo we will use a Stack Exchange posts dataset (from cs.stackexchange.com), which is available at archive.org.
- The dataset is in XML format and is about 156 MB in size.

!wget https://archive.org/download/stackexchange/cs.stackexchange.com.7z/Posts.xml

Load the dataset using Pandas

import pandas as pd
import re

# Parse the posts dump and keep only the text body of each post.
# Posts without a body become empty strings so the later string
# operations never see NaN.
data = pd.read_xml("Posts.xml").Body.fillna('')

Preprocess to remove unwanted tags

def pre_process(text):
    '''Normalise raw text before it is handed to the TF-IDF vectorizer.

    The input is lower-cased, every ``<...>`` tag is swapped for a
    placeholder, each run of digits / non-word characters is collapsed
    into a single space, and surrounding whitespace is trimmed.
    '''
    lowered = text.lower()
    without_tags = re.sub("</?.*?>", " <> ", lowered)  # remove html tags
    # remove special characters and digits (this also wipes the tag placeholder)
    words_only = re.sub("(\\d|\\W)+", " ", without_tags)
    return words_only.strip()  # remove blank characters at both ends

# Clean every post body; pre_process is passed directly, no lambda wrapper needed.
data = data.apply(pre_process)

Compute TF-IDF using Sklearn

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned posts.
# max_df=0.7 makes the vectorizer ignore terms that occur in more than
# 70% of the documents (they carry little discriminating power).
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.7,
    use_idf=True,
)
# Document-term matrix of TF-IDF scores for the training posts.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(data)

Store the fitted TF-IDF vectorizer for later use

# pickle is not imported anywhere earlier in this training snippet, so bring
# it into scope here; without it the dump below fails with NameError.
import pickle

# Persist the fitted vectorizer (vocabulary + IDF weights) so the app can
# score new text without re-training on the whole dataset.
with open('tfidf.pickle', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f, pickle.HIGHEST_PROTOCOL)

Now let's use Streamlit to create a UI

#import necessary libraries
import streamlit as st
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer

#Setting up Streamlit configurations
# Page-level settings: 'wide' layout and a sidebar that starts expanded.
# NOTE(review): Streamlit presumably requires set_page_config to run before
# any other st.* call, so keep this line first — confirm for the installed version.
st.set_page_config(layout='wide', initial_sidebar_state='expanded')
# Heading and a one-line description of what the page shows.
st.title('Keyword extraction using TFIDF')
st.markdown('Display top 5 keywords')
# Free-text input box; run() reads its current value through the
# module-level name Text.
Text = st.text_input('Enter the sentence & press enter key')


@st.cache(allow_output_mutation=True)
def load():
    '''Return the object unpickled from ``tfidf.pickle``.

    The file is expected to hold the fitted TF-IDF vectorizer written by
    the training step. The result is wrapped in Streamlit's cache
    decorator so it is not re-read from disk unnecessarily.
    NOTE(review): pickle must only be used on trusted files — this one is
    produced locally by the training step.
    '''
    with open('tfidf.pickle', 'rb') as pickle_file:
        return pickle.load(pickle_file)


def pre_process(text):
    '''Normalise the user's text the same way the training data was cleaned.

    The input is lower-cased, every ``<...>`` tag is swapped for a
    placeholder, each run of digits / non-word characters is collapsed
    into a single space, and surrounding whitespace is trimmed.
    '''
    lowered = text.lower()
    without_tags = re.sub("</?.*?>", " <> ", lowered)  # remove html tags
    # remove special characters and digits (this also wipes the tag placeholder)
    words_only = re.sub("(\\d|\\W)+", " ", without_tags)
    return words_only.strip()  # remove blank characters at both ends


def process(tfidf_vectorizer, text):
    '''Return the (at most) five highest-scoring TF-IDF terms of ``text``.

    Args:
        tfidf_vectorizer: fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.
        text: the (already pre-processed) input sentence.

    Returns:
        A DataFrame indexed by term with a single ``tfidf`` column, sorted
        by descending score and holding up to five rows. Only terms that
        actually occur in ``text`` (non-zero score) are included, so the
        frame is empty when no word of ``text`` is in the vocabulary.
        An empty string is returned when ``text`` is ``None`` or blank.
    '''
    top_n = 5

    # Nothing to score: keep the original "empty string" contract.
    if text is None or not text.strip():
        return ''

    scores = tfidf_vectorizer.transform([text])
    terms = tfidf_vectorizer.get_feature_names_out()

    # Work only on the stored (non-zero) entries of the single-row sparse
    # result. This avoids materialising and sorting a vocabulary-sized
    # frame, and it keeps zero-score vocabulary terms from being reported
    # as keywords when the input has fewer than five known terms.
    entries = scores.tocoo()
    df = pd.DataFrame({"tfidf": entries.data}, index=terms[entries.col])
    df = df[df["tfidf"] > 0]  # guard against explicitly stored zeros

    return df.sort_values(by=["tfidf"], ascending=False).head(top_n)


def run():
    '''Score the sentence currently in the text box and display the result.'''
    vectorizer = load()
    cleaned = pre_process(Text)
    top_terms = process(vectorizer, cleaned)
    # Echo the result and the raw input to stdout as well as to the page.
    print(top_terms, Text)
    st.write(top_terms)


if __name__ == '__main__':
    run()

To check out a live demo, please visit here.