Getting data from sites and making a summary of it - Python

Hi, I was working on two different scripts: one gets data through Selenium and the other builds a summary of that data. Getting the data from the sites works fine, but when I pass that data to the summary step, it doesn't arrive in my summary. Please let me know where I am making an error and how to fix it. I am new to Python and Selenium.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

"""
Taking input from user
"""
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')

driver = webdriver.Chrome(executable_path="E:\chromedriver\chromedriver.exe")

for i in range(1):
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)
    links_url = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))

    link_data = []
    for new_url in links:
        # print('\nnew url : ', new_url)
        driver.get(new_url)
        # Getting the data from the site
        try:
            link = driver.find_elements(By.TAG_NAME, "p")
            for p in link:
                datas = p.get_attribute("innerText")
                print(datas)
        except:
            continue

driver.quit()

# getting summary of data
print("\nOriginal text:")
print(datas)
textWordCount = len(datas.split())
print("The number of words in Original text are : " + str(textWordCount))

stopWords = set(stopwords.words("english"))
words = word_tokenize(datas)
freqTable = dict()
for word in words:
    word = word.lower()
    if word in stopWords:
        continue
    if word in freqTable:
        freqTable[word] += 1
    else:
        freqTable[word] = 1

sentences = sent_tokenize(datas)
sentenceValue = dict()
for sentence in sentences:
    for word, freq in freqTable.items():
        if word in sentence.lower():
            if sentence in sentenceValue:
                sentenceValue[sentence] += freq
            else:
                sentenceValue[sentence] = freq

sumValues = 0
for sentence in sentenceValue:
    sumValues += sentenceValue[sentence]
average = int(sumValues / len(sentenceValue))

summary = ''
for sentence in sentences:
    if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
        summary += " " + sentence

print("\nSummary:")
print(summary)
summaryWordCount = len(summary.split())
print("\nThe number of words in summary are : " + str(summaryWordCount))

The problem is with this line:
datas = p.get_attribute("innerText")
This rewrites the value of datas on each iteration of the loop, so by the time the summary code runs, datas only holds the text of the last <p> element from the last page visited.
I'm guessing that you really want to append to a list, or build up one string with a space between the pieces?
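For example, a minimal sketch of the scraping loop rewritten so the text accumulates instead of being overwritten (the all_text name is only illustrative, not from the original script):

all_text = ""                                   # collect the text of every page here
for new_url in links:
    driver.get(new_url)
    try:
        for p in driver.find_elements(By.TAG_NAME, "p"):
            # append each paragraph instead of overwriting the previous one
            all_text += p.get_attribute("innerText") + " "
    except:
        continue
driver.quit()

datas = all_text                                # the summary section now sees all of the text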

Related

How to get a mean for an int variable within 6 months based on a date variable

I would like to know how I could get the mean of notes by grouping dates into 6-month windows. In other words, let's say I want the mean of notes for all the comments between 01/01/2020 and 30/06/2020, and also between 01/07/2020 and 31/12/2020.
You get the idea :)
I would also like to know the number of comments within each 6-month window.
But I suppose that is much the same process.
Here are some rows of my database:
Here is how I obtained it with web scraping:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random

root_url = 'https://fr.trustpilot.com/review/www.gammvert.fr'
urls = ['{root}?page={i}'.format(root=root_url, i=i) for i in range(1, 807)]

comms = []
notes = []
dates = []

for url in urls:
    results = requests.get(url)
    time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('section', class_='review__content')
    for container in commentary:
        try:
            comm = container.find('p', class_='review-content__text').text.strip()
        except:
            comm = container.find('a', class_='link link--large link--dark').text.strip()
        comms.append(comm)
        note = container.find('div', class_='star-rating star-rating--medium').find('img')['alt']
        notes.append(note)
        date_tag = container.div.div.find("div", class_="review-content-header__dates")
        date = json.loads(re.search(r"({.*})", str(date_tag)).group(1))["publishedDate"]
        dates.append(date)

data = pd.DataFrame({
    'comms': comms,
    'notes': notes,
    'dates': dates
})

data['comms'] = data['comms'].str.replace('\n', '')
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
#print(data.head())
data.to_csv('file.csv', sep=';', index=False)
Here is the function I used to obtain my comms_clean and month columns:
def clean_text(text):
    text = tokenizer.tokenize(text)
    text = nltk.pos_tag(text)
    text = [word for word, pos in text
            if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    text = [word for word in text if not word in stop_words]
    text = [word for word in text if len(word) > 2]
    final_text = ' '.join([w for w in text if len(w) > 2])  # remove words with one letter
    return final_text

data['comms_clean'] = data['comms'].apply(lambda x: clean_text(x))
data['month'] = data.dates.dt.strftime('%Y-%m')
I suppose we can get that with the .dt accessor or the datetime tools, but I couldn't find out how. Do you have any idea or hint on how to get that?
Thank you :)
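A minimal sketch of one way to do this with pandas, assuming dates is already a datetime column and that the rating can be pulled out of notes as a digit (both assumptions may need adjusting to the real data):

import pandas as pd

data = pd.read_csv('file.csv', sep=';', parse_dates=['dates'])
# assumption: the rating is embedded as a digit in the 'notes' text
data['notes'] = pd.to_numeric(data['notes'].str.extract(r'(\d)')[0], errors='coerce')

# label each row with the half-year it belongs to (Jan-Jun vs Jul-Dec)
year = data['dates'].dt.year.rename('year')
half = data['dates'].dt.month.le(6).map({True: 'H1', False: 'H2'}).rename('half')

summary = data.groupby([year, half]).agg(
    mean_note=('notes', 'mean'),      # mean of the notes per 6-month window
    n_comments=('comms', 'count'),    # number of comments per window
)
print(summary)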

Object of type 'NoneType' has no len() error

What I want is to count the occurrences of a specific word on the first page of Google results for a site, and then do the same count for another word; if that second word appears more than 2 times, I change the occurrence count of the first word to 0. But I get this error:
File "D:\HQ_Bot-master\answer_bot.py", line 307, in
get_points_live()
File "D:\HQ_Bot-master\answer_bot.py", line 293, in get_points_live
points,maxo = google_wiki(simq, options, neg)
File "D:\HQ_Bot-master\answer_bot.py", line 242, in google_wiki
count2 = len(words2)
TypeError: object of type 'NoneType' has no len()
Here is my code:
import string
import requests
import json
import urllib.request as urllib2
from bs4 import BeautifulSoup
from google import google
from PIL import Image
import pytesseract
import argparse
import cv2
import os
import pyscreenshot as Imagegrab
import sys
import wx
from halo import Halo

def google_wiki(sim_ques, options, neg):
    spinner = Halo(text='Googling and searching Wikipedia', spinner='dots2')
    spinner.start()
    num_pages = 1
    points = list()
    content = ""
    maxo = ""
    maxp = -sys.maxsize
    i = 0
    temp = 0
    ques = ""
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    sim_ques22 = sim_ques.translate(translator)
    while i < 3:
        o = options[i]
        if i <= 1:
            x = options[i+1]
        else:
            x = options[i-1]
        o = o.lower()
        x = x.lower()
        ques += sim_ques22 + ' ' + o + ' wiki'
        print(ques)
        page = requests.get("http://www.google.com/search?q=" + ques)
        soup = BeautifulSoup(page.text, "lxml")
        words = soup.find(text=lambda text: text and o in text)
        if(type(words) is not None):
            count = len(words)
        words2 = soup.find(text=lambda text: text and x in text)
        if(type(words) is not None):
            count2 = len(words2)
        if count2 >= 2:
            temp = 0
        else:
            temp = count
        if neg:
            temp *= -1
        points.append(temp)
        if temp > maxp:
            maxp = temp
            maxo = o
        ques = ""
        i += 1
    spinner.succeed()
    spinner.stop()
    return points, maxo
You can use a simple ternary expression:
count = len(words) if words else 0
which is the same as this:
if words:  # this checks whether it is truthy (which None is not)
    count = len(words)
else:
    count = 0
If you prefer, you can make the check explicit with words is not None.
EDIT: I used a ternary expression because you use the variable later on. Otherwise, you'll end up with a NameError.
Just use try and except if you want to continue without the error, or catch the error and print it if required:
try:
    ...  # your code where you got the error
except:
    pass  # or print the error you caught if you want
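Applied to the line from the traceback, a minimal sketch of that idea (illustrative only, with the broad except narrowed to the actual TypeError) would be:

words2 = soup.find(text=lambda text: text and x in text)
try:
    count2 = len(words2)
except TypeError:
    # soup.find() returned None because nothing on the page matched
    count2 = 0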

How to find and match each element of a list in each sentence?

I have a file containing some sentences. I used polyglot for Named Entity Recognition and stored all detected entities in a list. Now I want to check whether any entity, or pair of entities, from that list appears in each sentence, and show it to me.
Here is what I did:
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
test = Text(input_file, hint_language_code='fa')

list_entity = []
for sent in test.sentences:
    #print(sent[:10], "\n")
    for entity in test.entities:
        list_entity.append(entity)

for i in range(len(test)):
    m = test.entities[i]
    n = test.words[m.start: m.end]  # it shows only word not tag
    if str(n).split('.')[-1] in test:  # if each entity exists in each sentence
        print(n)
It gives me an empty list.
Input:
sentence1: Bill Gate is the founder of Microsoft.
sentence2: Trump is the president of USA.
Expected output:
Bill Gate, Microsoft
Trump, USA
Output of list_entity:
I-PER(['Trump']), I-LOC(['USA'])
How to check if I-PER(['Trump']), I-LOC(['USA']) is in first sentence?
For starters, you were adding the whole text file input to the entities list.
entities can only be called on each sentence of the polyglot object.
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='fa')

list_entity = []
for sentence in file.sentences:
    for entity in sentence.entities:
        #print(entity)
        list_entity.append(entity)

print(list_entity)
Now you don't have an empty list.
As for your problem with identifying the entity terms:
I have not found a way to construct an entity by hand, so the following simply checks whether there are entities with the same term. A Chunk can hold multiple strings, so we go through them iteratively.
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='ar')

def check_sentence(entities_list, sentence):  ## Check if string terms
    for term in entities_list:                ## are in any of the entities
        ## Compare each Chunk in the list to each Chunk
        ## object in the sentence and see if there's any matches.
        if any(any(entityTerm == term for entityTerm in entityObject)
               for entityObject in sentence.entities):
            pass
        else:
            return False
    return True

sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]
entity_terms = ["Bill",
                "Gates"]

if check_sentence(entity_terms, sentence):
    print("Entity Terms " + str(entity_terms) +
          " are in the sentence. '" + str(sentence) + "'")
else:
    print("Sentence '" + str(sentence) +
          "' doesn't contain terms" + str(entity_terms))
Once you find a way to generate arbitrary entities all you'll have to do is stop popping the term from the sentence checker so you can do type comparison as well.
If you just want to match the list of entities in the file against a specific sentence, then this should do the trick:
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='fa')

def return_match(entities_list, sentence):  ## Check if and which chunks
    matches = []                            ## are in the sentence
    for term in entities_list:
        ## Check each list in each Chunk object
        ## and see if there's any matches.
        for entity in sentence.entities:
            if entity == term:
                for word in entity:
                    matches.append(word)
    return matches

def return_list_of_entities(file):
    list_entity = []
    for sentence in file.sentences:
        for entity in sentence.entities:
            list_entity.append(entity)
    return list_entity

list_entity = return_list_of_entities(file)

sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]
match = return_match(list_entity, sentence)

if match:
    print("Entity Term " + str(match) +
          " is in the sentence. '" + str(sentence) + "'")
else:
    print("Sentence '" + str(sentence) +
          "' doesn't contain any of the terms" + str(list_entity))

How do I extract the text from the <class 'generator'> object of an HTML tag while using a web crawler?

import requests
from bs4 import BeautifulSoup
import operator
from collections import Counter

def start(url):
    wordlist = []
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, 'html.parser')
    for each_text in soup.findAll('div', {'class': 'entry-content'}):
        content = each_text.strings
        words = content.lower().split()
        for each_word in words:
            wordlist.append(each_word)
    clean_wordlist(wordlist)

def clean_wordlist(wordlist):
    clean_list = []
    for word in wordlist:
        symbols = '!@#$%^&*()_-+={[}]|\;:"<>?/.,'
        for i in range(0, len(symbols)):
            word = word.replace(symbols[i], '')
        if len(word) > 0:
            clean_list.append(word)
    create_dictionary(clean_list)

def create_dictionary(clean_list):
    word_count = {}
    for word in clean_list:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
        print("%s : %s " % (key, value))
    c = Counter(word_count)
    top = c.most_common(3)
    print(top)

start("https://www.geeksforgeeks.org/programming-language-choose/")
The above program gives the error AttributeError: 'generator' object has no attribute 'lower'.
I printed out the type that each_text.strings was returning, which printed <class 'generator'>, but now how do I move forward and get the text from the given link?
Instead of creating a generator object we just use .text, or if you really wanted to use .strings you could do unpacking (i.e. print(*strings_object)).
As you can tell, we use the asterisk before the object to unpack it. I'll not go into details, but you can find more about it HERE.
import requests
from bs4 import BeautifulSoup
import operator
from collections import Counter

def start(url):
    wordlist = []
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, 'html.parser')
    for each_text in soup.findAll('div', {'class': 'entry-content'}):
        content = each_text.text
        words = content.lower().split()
        for each_word in words:
            wordlist.append(each_word)
    clean_wordlist(wordlist)

def clean_wordlist(wordlist):
    clean_list = []
    for word in wordlist:
        symbols = '!@#$%^&*()_-+={[}]|\;:"<>?/.,'
        for i in range(0, len(symbols)):
            word = word.replace(symbols[i], '')
        if len(word) > 0:
            clean_list.append(word)
    create_dictionary(clean_list)

def create_dictionary(clean_list):
    word_count = {}
    for word in clean_list:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    for key, value in sorted(word_count.items(), key=operator.itemgetter(1)):
        print("%s : %s " % (key, value))
    c = Counter(word_count)
    top = c.most_common(3)
    print(top)

start("https://www.geeksforgeeks.org/programming-language-choose/")

Loading a classifier using Pickle?

I am trying to run a sentiment analysis. I have managed to use Naive Bayes through nltk to classify a corpus of negative and positive tweets. However, I do not want to go through the process of training this classifier every time I run the program, so I tried to use pickle to save it and then load it into a different script. However, when I run the script it returns the error NameError: name 'classifier' is not defined, although I thought it was defined through the def load_classifier():
The code I have atm is below:
import nltk, pickle
from nltk.corpus import stopwords

customstopwords = ['']

p = open('xxx', 'r')
postxt = p.readlines()
n = open('xxx', 'r')
negtxt = n.readlines()

neglist = []
poslist = []
for i in range(0, len(negtxt)):
    neglist.append('negative')
for i in range(0, len(postxt)):
    poslist.append('positive')

postagged = zip(postxt, poslist)
negtagged = zip(negtxt, neglist)
taggedtweets = postagged + negtagged

tweets = []
for (word, sentiment) in taggedtweets:
    word_filter = [i.lower() for i in word.split()]
    tweets.append((word_filter, sentiment))

def getwords(tweets):
    allwords = []
    for (words, sentiment) in tweets:
        allwords.extend(words)
    return allwords

def getwordfeatures(listoftweets):
    wordfreq = nltk.FreqDist(listoftweets)
    words = wordfreq.keys()
    return words

wordlist = [i for i in getwordfeatures(getwords(tweets)) if not i in stopwords.words('english')]
wordlist = [i for i in getwordfeatures(getwords(tweets)) if not i in customstopwords]

def feature_extractor(doc):
    docwords = set(doc)
    features = {}
    for i in wordlist:
        features['contains(%s)' % i] = (i in docwords)
    return features

training_set = nltk.classify.apply_features(feature_extractor, tweets)

def load_classifier():
    f = open('my_classifier.pickle', 'rb')
    classifier = pickle.load(f)
    f.close
    return classifier

while True:
    input = raw_input('I hate this film')
    if input == 'exit':
        break
    elif input == 'informfeatures':
        print classifier.show_most_informative_features(n=30)
        continue
    else:
        input = input.lower()
        input = input.split()
        print '\nSentiment is ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n'

p.close()
n.close()
Any help would be great; the script seems to make it to the print '\nSentiment is ' + classifier.classify(feature_extractor(input)) + ' in that sentence.\n' line before returning the error...
Well, you have declared and defined the load_classifier() method but never called it or assigned its result to a variable. That means that by the time execution reaches the print '\nSentiment is... ' line, there is no variable named classifier, so the execution naturally throws an exception.
Add the line classifier = load_classifier() just before the while loop (without any indentation).
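In other words, a minimal sketch of where the call goes (the surrounding lines are from the script above):

training_set = nltk.classify.apply_features(feature_extractor, tweets)

classifier = load_classifier()   # actually call the loader so the name 'classifier' exists

while True:
    input = raw_input('I hate this film')
    # ... rest of the loop unchanged ...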
