[Python] How to extract and export word counts from a document

Import packages

import numpy as np 
import pandas as pd 
import re # for regular expression 
import nltk # package for natural language processing 
import csv # for importing csv format data 
import openpyxl # for exporting xlsx format data 
from string import punctuation
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk import FreqDist

Load dataset

# Load the input CSV into a DataFrame.
data = pd.read_csv('after.csv')
data
data.values  # the raw cell values as a NumPy array
# Flatten every cell into one text blob.
# - Cells are joined with a space so the last word of one row cannot fuse with
#   the first word of the next row once punctuation is stripped.
# - Each cell is converted individually, so NumPy's array repr (brackets,
#   quotes, escaped characters such as "\n") never leaks into the text.
# - Missing cells are skipped instead of contributing a literal "nan" token.
values = " ".join(
    str(cell) for cell in data.values.ravel() if not pd.isna(cell)
)
values

Preprocessing

Parsing and tokenizing data using regular expressions

# Strip punctuation: delete every run made up of underscores and/or characters
# that are neither whitespace nor word characters.
non_word_runs = re.compile(r'([^\s\w]|_)+')
parse = non_word_runs.sub('', values)
print(parse)

# Lower-case the cleaned text so that counting is case-insensitive.
word_token = parse.lower()
# Split the text into word tokens. `tokenized` is the name the stopword
# filter reads, so it must be assigned here (previously it never was).
# NOTE: word_tokenize needs NLTK's "punkt" tokenizer data to be installed.
tokenized = word_tokenize(word_token)
tokenized

Removing Stopwords

# English stopwords plus punctuation characters, held in a set so each
# membership test is O(1).
stoplist = set(stopwords.words('english') + list(punctuation))
# Filter against the prebuilt set. Calling stopwords.words() inside the
# comprehension would re-read the corpus for every token, search a list
# linearly, and (with no argument) match stopwords from every language
# rather than only English.
tokens_without_sw = [word for word in tokenized if word not in stoplist]

Calculate word count

# Count every pair of adjacent tokens (bigrams), most frequent first.
bigram_freq = FreqDist(ngrams(tokens_without_sw, 2))
Ngram = bigram_freq.most_common()

# Save the (bigram, count) pairs as a two-column CSV without the row index.
cooccurrence = pd.DataFrame(Ngram, columns=['word', 'count'])
cooccurrence.to_csv('after_cooccurence.csv', index=False)

ngramdf = pd.Series(Ngram)  # a pandas Series of (bigram, count) tuples
ngramdf
len(tokens_without_sw)
tokens_without_sw
# Keep the 300 most frequent single words (change 300 to keep more or fewer).
word_counts = pd.Series(tokens_without_sw).value_counts(normalize=False)
df = word_counts.head(300)
df

Export results to a CSV file

# Write the frequency table to a CSV file, keeping the index as a column.
df.to_csv(path_or_buf='after_freq.csv', index=True)
  • April 7, 2021