Need help in creating a CSV output file using pandas. I am only getting one line of data in my CSV output file and there should be five. I have tried adding print statements in various places to check whether my input CSV file is iterating, and it is. I'm not sure why it's not storing all five results in the sentiment.csv file. There are 5 URLs in the urls2.csv file, so one would expect 5 results in sentiment.csv, but I am only getting one. Any help would be greatly appreciated.
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import pandas as pd
from bs4 import BeautifulSoup
import requests
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')
df = pd.read_csv("urls2.csv")
urls = df["Address"].tolist()
url_sent_score = []
url_sent_label = []
total_pos = []
total_neg = []
for count, x in enumerate(urls):
    url = x
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    res = requests.get(url, headers=headers)
    html_page = res.text
    soup = BeautifulSoup(html_page, 'html.parser')
    for script in soup(["script", "style", "meta", "label", "header", "footer"]):
        script.decompose()
    page_text = (soup.get_text()).lower()
    page_text = page_text.strip().replace(" ", "")
    page_text = "".join([s for s in page_text.splitlines(True) if s.strip("\r\n")])
    doc = nlp(page_text)
    sentiment = doc._.blob.polarity
    sentiment = round(sentiment, 2)
    if sentiment > 0:
        sent_label = "Positive"
    else:
        sent_label = "Negative"
    url_sent_label.append(sent_label)
    url_sent_score.append(sentiment)
    positive_words = []
    negative_words = []
    for x in doc._.blob.sentiment_assessments.assessments:
        if x[1] > 0:
            positive_words.append(x[0][0])
        elif x[1] < 0:
            negative_words.append(x[0][0])
        else:
            pass
    total_pos.append(', '.join(set(positive_words)))
    total_neg.append(', '.join(set(negative_words)))

data = {
    'Sentiment Score': url_sent_score,
    'Sentiment Label': url_sent_label,
    'Positive Words': total_pos,
    'Negative Words': total_neg}

df = pd.DataFrame(data)
df.to_csv("sentiment.csv")
df.to_json("sentiment.json", orient="split")
The function .to_csv writes a file whose default field delimiter is a comma (,); see the documentation (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html). You should change the field delimiter of your file to a newline, like this:
df.to_csv("sentiment.csv",sep="\n")
where "\n" writes each row of the dataframe in a new line.
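For reference, a minimal sketch (with made-up values) of how a DataFrame built from equal-length lists is written by to_csv with its default settings:

import pandas as pd

# Three scores and three labels give a three-row DataFrame, so to_csv()
# writes three comma-separated data lines plus a header line.
scores = [0.12, -0.30, 0.45]                      # made-up example values
labels = ["Positive", "Negative", "Positive"]
df = pd.DataFrame({"Sentiment Score": scores, "Sentiment Label": labels})
df.to_csv("example.csv", index=False)             # default sep=","

Each element of the lists becomes one row, so the number of data lines in the CSV matches the length of the lists at the moment the DataFrame is created.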
So I'm learning more about Python every day. I'm doing a mini web-scraping project, and at the very end, when I should see the results in an exported CSV, it comes up blank except for the headers. Any help is gladly appreciated! Thanks.
The code is below:
import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")
box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")
with open('imdbmovies.csv', 'a', newline='') as csvfile:
    writer = csv.writer(csvfile)

    # Write headers to CSV file
    writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])

    for row in box_office_table:
        try:
            year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
            money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
            releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
            gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
            numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")

            if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
                total_gross_cell = money_cells[0]
                average_cell = money_cells[1]

                year = year_cell.text.strip()
                total_gross = total_gross_cell.text.strip()
                releases = releases_cell.text.strip()
                average = average_cell.text.strip()
                gross_change = gross_change_cell.text.strip()
                numone_release = numone_release_cell.text.strip()

                print(year, total_gross, releases, average, gross_change, numone_release)

                # Write the row to the CSV file
                writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
        except AttributeError:
            # Either a cell is not found
            pass
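One way to narrow down why the CSV stays blank is to print what the selectors actually match before writing anything. This is only a debugging sketch against the same page, not a fix:

import requests
from bs4 import BeautifulSoup

url = "https://www.boxofficemojo.com/year/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
rows = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")
print("rows found:", len(rows))

# Inspect the first few rows to see which class names their cells actually carry;
# if a class used in the find() calls above never appears, that row is skipped.
for row in rows[:3]:
    for cell in row.find_all("td"):
        print(cell.get("class"), cell.text.strip()[:30])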
I had a problem when I extracted HTML tables and imported them into Excel.
This is the site I need to get information from: https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html
As you can see, the header row of the GDP table, named 年份, is split across two lines.
That's why, after I exported the Excel file, it gave unexpected results.
The result I want is that the first line in Excel only has: 年份, GDP(美元), 占世界%
Sorry for my confusing explanation, I really don't know how to explain it in detail.
Here is my Python code:
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        r = "fail"
        return r

def getGDP(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    trs = soup.find_all('tr')
    for tr in trs:
        list = []
        for th in tr:
            ts = th.string
            if ts == '\n':
                continue
            list.append(ts)
        ulist.append(list)

def saveGDP(ulist):
    file_name = '21095010 胡碧玉 GDP.csv'
    with open(file_name, 'w', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(ulist)

def main():
    unifo = []
    url = 'https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
    html = get_html(url)
    getGDP(unifo, html)
    saveGDP(unifo)

if __name__ == "__main__":
    main()
Thank you so much!
Using pandas, scraping tables and cleaning up the results is in most cases much easier - under the hood, BeautifulSoup is doing the work for you.
In this case, read_html() the table, drop the unwanted header level and filter out the rows containing ads:
import pandas as pd
df = pd.read_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')[0].droplevel(0, axis=1)
df[~df.iloc[:,0].str.contains('ads')].to_csv('21095010 胡碧玉 GDP.csv', index=False)
Answering your question
You have to select your elements more specifically, e.g. with CSS selectors.
So first get the thead information from all th without a colspan, then collect the data from all tr in tbody that do not contain ads:
def getGDP(html):
    soup = BeautifulSoup(html, "html.parser")
    data = []
    data.append([th.text for th in soup.select('thead th:not([colspan])')])
    for row in soup.select('tbody tr:not(:-soup-contains("ads"))'):
        data.append(list(row.stripped_strings))
    return data
Example
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        r = "fail"
        return r

def getGDP(html):
    soup = BeautifulSoup(html, "html.parser")
    data = []
    data.append([th.text for th in soup.select('thead th:not([colspan])')])
    for x in soup.select('tbody tr:not(:-soup-contains("ads"))'):
        data.append(list(x.stripped_strings))
    return data

def saveGDP(ulist):
    file_name = '21095010 胡碧玉 GDP.csv'
    print(ulist)
    with open(file_name, 'w', errors='ignore', encoding='utf-8') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(ulist)

def main():
    url = 'https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
    html = get_html(url)
    saveGDP(getGDP(html))

if __name__ == "__main__":
    main()
When I run the code below, the for loop saves the first text correctly into a separate file, but the second iteration saves the first AND the second into another separate file, the third iteration saves the first, second and third into a separate file, and so on. I'd like to save each iteration into a separate file without adding the previous iterations. I don't have a clue what I'm missing here. Can anyone help, please?
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'http://www.chakoteya.net/StarTrek/'
end_url = ['1.htm', '6.htm', '8.htm', '2.htm', '7.htm',
'5.htm', '4.htm', '10.htm', '12.htm', '11.htm', '3.htm', '16.htm']
episodes = []
count = 0
for end_url in end_url:
    url = base_url + end_url
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    episodes.append(soup.text)

    file_text = open(f"./{count}.txt", "w")
    file_text.writelines()
    file_text.close()

    count = count + 1
    print(f"saved file for url:{url}")
Please consider the following points!
There's no reason at all to use bs4 here, since response.text already holds the same content.
You have to use the same Session, as explained in my previous answer.
You can use iteration with an f-string/format, which will make your code cleaner and easier to read.
A with context manager is less of a headache, as you don't need to remember to close your file afterwards.
import requests

block = [9, 13, 14, 15]

def main(url):
    with requests.Session() as req:
        for page in range(1, 17):
            if page not in block:
                print(f'Extracting Page# {page}')
                r = req.get(url.format(page))
                with open(f'{page}.htm', 'w') as f:
                    f.write(r.text)

main('http://www.chakoteya.net/StarTrek/{}.htm')
You need to empty your episodes list on each iteration. Try the following:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'http://www.chakoteya.net/StarTrek/'
end_url = ['1.htm', '6.htm', '8.htm', '2.htm', '7.htm',
'5.htm', '4.htm', '10.htm', '12.htm', '11.htm', '3.htm', '16.htm']
count = 0
for end_url in end_url:
    episodes = []
    url = base_url + end_url
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    episodes.append(soup.text)

    file_text = open(f"./{count}.txt", "w")
    file_text.writelines(episodes)
    file_text.close()

    count = count + 1
    print(f"saved file for url:{url}")
It doesn't appear that your code would save anything to the files at all, as you are calling writelines with no arguments.
if __name__ == '__main__':
    import requests
    from bs4 import BeautifulSoup

    base_url = 'http://www.chakoteya.net/StarTrek/'
    paths = ['1.htm', '6.htm', '8.htm', '2.htm', '7.htm',
             '5.htm', '4.htm', '10.htm', '12.htm', '11.htm', '3.htm', '16.htm']

    for path in paths:
        url = f'{base_url}{path}'
        filename = path.split('.')[0]

        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')

        with open(f"./{filename}.txt", "w") as f:
            f.write(soup.text)

        print(f"saved file for url:{url}")
This is reworked a little. It wasn't clear why the data was being appended to episodes, so that was left off.
Maybe you were writing the list to the file, which would account for the duplicates: you were adding the content for each file to a list and writing that growing list on each iteration.
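A minimal standalone sketch of that effect, using dummy strings instead of scraped pages, shows why each file ends up containing every previous iteration as well:

pages = ["first page", "second page", "third page"]   # stand-ins for soup.text
episodes = []
for count, text in enumerate(pages):
    episodes.append(text)                  # the list keeps growing across iterations
    with open(f"{count}.txt", "w") as f:
        f.writelines(episodes)             # writes the whole list every time
# 0.txt holds one page, 1.txt holds two pages, 2.txt holds all three.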
import re

filename = "access.log"
path = ""

with open(path + filename, "r") as logfile:
    count = 0
    for line in logfile:  # Loops through the log file
        regex = ('(?:(GET|POST) )(\S+)')  # Stores the regex
        url = re.findall(regex, line)  # Uses the findall method and stores it in url variable
        print(url[0][1])  # Prints out a list of URLs
This is an example of the log file
access.log
209.160.24.63 - - [01/Feb/2021:18:22:17] "GET /product.screen?productId=BS-AG-G09&JSESSIONID=SD0SL6FF7ADFF4953 HTTP 1.1" 200 2550 "http://www.google.com/productid=12wdef" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 Safari/536.5" 422
I can extract the request part of the line with the regex above, but now I want to split it up and store it in a dict in Python.
Since you already got the matched string, you can just split it at the first whitespace that occurs in the string:
s = "GET /product.screen?productId=BS-AG-G09&JSESSIONID=SD0SL6FF7ADFF4953"
s.split(" ", 1)
should return
['GET', '/product.screen?productId=BS-AG-G09&JSESSIONID=SD0SL6FF7ADFF4953']
You can just transform the data accordingly after.
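For example, a minimal sketch of that transformation using the standard library's urllib.parse (the variable names are only illustrative):

from urllib.parse import urlsplit, parse_qs

s = "GET /product.screen?productId=BS-AG-G09&JSESSIONID=SD0SL6FF7ADFF4953"
method, path = s.split(" ", 1)
parts = urlsplit(path)
record = {
    "method": method,
    "resource": parts.path,  # '/product.screen'
    "parameters": {k: v[0] for k, v in parse_qs(parts.query).items()},
}
print(record)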
import re

filename = "access.log"
dictionary = {}
list_resources = []
count = 0

with open(filename, "r") as logfile:
    for line in logfile:  # Loops through the log file
        regex = ('(?:(GET|POST) )(\S+)')  # Stores the regex
        url = re.findall(regex, line)[0][1]  # Uses the findall method and stores it in url variable
        list_resources.append(url)

        resource = re.split("\?", url)[0]
        parameters = re.split("\?", url)[1]
        parameter = re.split("&", parameters)

        param_dict = {}
        for i in parameter:
            key = re.split('=', i)[0]
            value = re.split('=', i)[1]
            param_dict[key] = value

        dictionary[count] = {'resource': resource, 'parameters': param_dict}
        count += 1

# print(list_resources)
print(dictionary)
I figured out what I wanted to do: split up the URL and store the resource and parameters in a dictionary.
I'm attempting to scrape data for all the quarterbacks who have been drafted. http://www.nfl.com/draft/history/fulldraft?type=position
I'm able to scrape the data; however, there are blank lines that I cannot get rid of (screenshot: Excel file output).
Here is the code that I used.
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

playerdata = playerdatasaved = ""
soup = make_soup("http://www.nfl.com/draft/history/fulldraft?type=position")

for record in soup.findAll('tr'):
    playerdata = ""
    for data in record.findAll('td'):
        playerdata = playerdata + "," + data.text
    if len(playerdata) != 0:
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

header = "Round, Selection #, Player, Position, School, Team Drafted" + "\n"
file = open("Quarterbacks.csv", "wb")
file.write(bytes(header, encoding="ascii", errors='ignore'))
file.write(bytes(playerdatasaved, encoding="ascii", errors='ignore'))
I've tried to use an if statement to check for \n breaks and remove the breaks. Also, I've tried to turn the data into a string and use a replace or split command. None of these corrected the issue.
Thanks for any help that you can give me!
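One common cause of blank rows is leftover whitespace or newline characters inside the cell text. A minimal, untested sketch that collapses whitespace in each cell and skips rows that end up empty (reusing the same soup object as above) might look like this:

rows = []
for record in soup.findAll('tr'):
    # collapse any internal whitespace/newlines inside each cell
    cells = [" ".join(data.text.split()) for data in record.findAll('td')]
    if any(cells):  # skip rows whose cells are all empty
        rows.append(",".join(cells))

with open("Quarterbacks.csv", "w", encoding="ascii", errors="ignore") as f:
    f.write("Round, Selection #, Player, Position, School, Team Drafted\n")
    f.write("\n".join(rows))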