Pandas merge dataframes on same column - python

I am writing a scraper and I want loop through a list of links and merge all results as columns to a dataframe on same key (like a left join).
I run this code in the Ipython Notebook, the resulting csv that comes from the dataframe does not make sense, however if after running the script I merge df and df2 on the mutual column "questions", I get the join that I need, but in the script there's something wrong.
Here's the whole script, there's log in with requests but you don't have to make a user, you can run it as a guest and you just won't get all answers in the review.
import requests
from bs4 import BeautifulSoup as bs
import csv
import pandas as pd
get_url = 'https://www.g2crowd.com/login?form=login'
post_url = 'https://www.g2crowd.com/user_sessions'
review_url = 'https://www.g2crowd.com/survey_responses/salesforce-crm-review-29972'
links = []
with open("links.csv", "r") as f:
spamreader = csv.reader(f, delimiter=',')
for row in spamreader:
links.append(row)
links = links[1:]
s = requests.Session()
r = s.get(get_url)
soup = bs(r.text)
token = soup.select('input[name="authenticity_token"]')[0]['value']
username = 'email#gmail.com'
password = 'password'
payload = {"user_session[login_email]": "email#gmail.com", "user_session[password]": "password"}
payload['authenticity_token'] = token
Referer = dict(Referer=get_url)
r = s.post(post_url, data=payload, headers=Referer)
print r.status_code
df = pd.read_csv("data.csv")
#df = df.set_index('questions')
for link in links:
r = s.get(link[0])
soup = bs(r.text)
title = soup.title.contents[0]
question_wrapper = soup.findAll("div", class_="question-wrapper")
print len(question_wrapper)
questions = []
answers = []
scraped_titles = []
tricky_title = 'Salesforce CRM Review by G2 Crowd User in Transportation/Trucking/Railroad - March 13, 2013'
with open("scraped_titles.csv", "r") as f:
spamreader = csv.reader(f, delimiter=',')
for row in spamreader:
scraped_titles.append(row[0])
scraped_titles = set(scraped_titles)
if (title not in scraped_titles and title != tricky_title):
for question in question_wrapper:
q = question.label.contents[0]
a = question.div.contents[0].text
questions.append(q)
answers.append(a)
#qa = zip(questions, answers)
qa = dict(questions=questions, answers=answers)
df2 = pd.DataFrame(qa)
#df2 = df2.set_index('questions', inplace=True)
#df2.to_csv(title + ".csv", encoding='utf-8')
df = pd.merge(df, df2, how='left', on='questions')
with open("scraped_titles.csv", "a") as csvwriter:
spamreader = csv.writer(csvwriter, delimiter=',')
spamreader.writerow([unicode(title).encode("utf-8")])
else:
pass
df.to_csv("all_data.csv", encoding='utf-8')
I also tried to save every review to .csv and then merge everything with Pandas, but I get a weird, rare undocumented error:
Error: new-line character seen in unquoted field - do you need to open
the file in universal-newline mode?
I have been trying to find my error for a pretty while, if somebody can point it out it would be extremely helpful.
Also, I hope I have formatted the post according to the rules, if not please help me to correct it.

Related

CSV file being exported empty and only the headers are showing?

So Im learning more about python everyday. Im doing a mini web scrape project and at the very end when I should see the results on an exported csv - it comes up blank except for the headers. Any help is gladly appreciated! Thanks.
The code is below:
import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.boxofficemojo.com/year/"
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")
box_office_table = soup.find("div", class_="a-section mojo-body aok-relative").find_all("tr")
with open('imdbmovies.csv', 'a', newline='') as csvfile:
writer = csv.writer(csvfile)
# Write headers to CSV file
writer.writerow(['numone_release', 'year', 'total_gross', 'releases', 'average', 'gross_change'])
for row in box_office_table:
try:
year_cell = row.find("td", class_="a-text-left mojo-header-column mojo-field-type-year mojo-sort-column")
money_cells = row.find_all("td", class_="a-text-right mojo-field-type-money")
releases_cell = row.find("td", class_="a-text-right mojo-field-type-positive_integer")
gross_change_cell = row.find("td", class_="a-text-right mojo-number-delta mojo-field-type-percent_delta")
numone_release_cell = row.find("td", class_="a-text-left mojo-field-type-release mojo-cell-wide")
if len(money_cells) >= 2 and year_cell is not None and releases_cell is not None and gross_change_cell is not None and numone_release_cell is not None:
total_gross_cell = money_cells[0]
average_cell = money_cells[1]
year = year_cell.text.strip()
total_gross = total_gross_cell.text.strip()
releases = releases_cell.text.strip()
average = average_cell.text.strip()
gross_change = gross_change_cell.text.strip()
numone_release = numone_release_cell.text.strip()
print(year, total_gross, releases, average, gross_change, numone_release)
# Write the row to the CSV file
writer.writerow([numone_release, year, total_gross, releases, average, gross_change])
except AttributeError:
# Either a cell is not found
pass

csv.writer not writing entire output to CSV file

I am attempting to scrape the artists' Spotify streaming rankings from Kworb.net into a CSV file and I've nearly succeeded except I'm running into a weird issue.
The code below successfully scrapes all 10,000 of the listed artists into the console:
import requests
from bs4 import BeautifulSoup
import csv
URL = "https://kworb.net/spotify/artists.html"
result = requests.get(URL)
src = result.content
soup = BeautifulSoup(src, 'html.parser')
table = soup.find('table', id="spotifyartistindex")
header_tags = table.find_all('th')
headers = [header.text.strip() for header in header_tags]
rows = []
data_rows = table.find_all('tr')
for row in data_rows:
value = row.find_all('td')
beautified_value = [dp.text.strip() for dp in value]
print(beautified_value)
if len(beautified_value) == 0:
continue
rows.append(beautified_value)
The issue arises when I use the following code to save the output to a CSV file:
with open('artist_rankings.csv', 'w', newline="") as output:
writer = csv.writer(output)
writer.writerow(headers)
writer.writerows(rows)
For whatever reason, only 738 of the artists are saved to the file. Does anyone know what could be causing this?
Thanks so much for any help!
As an alternative approach, you might want to make your life easier next time and use pandas.
Here's how:
import requests
import pandas as pd
source = requests.get("https://kworb.net/spotify/artists.html")
df = pd.concat(pd.read_html(source.text, flavor="bs4"))
df.to_csv("artists.csv", index=False)
This outputs a .csv file with 10,000 artists.

Write.CSV to a folder showing file not defined error

I have a script which produces multiple .csv files and each .csv file has its own name which is a variable. I am trying to save these files to a specific path instead of saving them to the Python folder.
I have tried this tutorial Specify path in write.csv function but it gave me this error: NameError: name 'file' is not defined and I tried to find other people who had the same issue when using write.csv but was unable to find any
I am on MacOS
Here is the code:
path = '/Users/chris/Desktop/cd'
fcsv = csv.writer(open, file.path(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
I have tried multiple examples of writing csv to file path and have had 0 success. If anyone has any ideas or suggestions I'd love to hear them.
Here is my full code:
import csv
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
headers = []
datarows = []
# define 1-1-2020 as a datetime object
after_date = datetime(2020, 1, 1)
with requests.Session() as s:
s.headers = {"User-Agent": "Safari/537.36"}
r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-20.html')
soup = bs(r.content, 'lxml')
# select all tr elements (minus the first one, which is the header)
table_elements = soup.select('tr')[1:]
address_links = []
for element in table_elements:
children = element.contents # get children of table element
url = children[1].a['href']
last_out_str = children[8].text
# check to make sure the date field isn't empty
if last_out_str != "":
# load date into datetime object for comparison (second part is defining the layout of the date as years-months-days hour:minute:second timezone)
last_out = datetime.strptime(last_out_str, "%Y-%m-%d %H:%M:%S %Z")
# if check to see if the date is after 2020/1/1
if last_out > after_date:
address_links.append(url)
for url in address_links:
r = s.get(url)
soup = bs(r.content, 'lxml')
table = soup.find(id="table_maina")
#Get the Doge Address for the filename
item = soup.find('h1').text
newitem = item.replace('Dogecoin', '')
finalitem = newitem.replace('Address', '')
finalitem = finalitem.replace(' ', '')
#Get the profit
sections = soup.find_all(class_='table-striped')
for section in sections:
oldprofit = section.find_all('td')[11].text
removetext = oldprofit.replace('USD', '')
removetext = removetext.replace(' ', '')
removetext = removetext.replace(',', '')
profit = float(removetext)
# Compare profit to goal
goal = float(50000)
if profit < goal:
continue
if table:
for row in table.find_all('tr'):
heads = row.find_all('th')
if heads:
headers = [th.text for th in heads]
else:
datarows.append([td.text for td in row.find_all('td')])
path = '/Users/chris/Desktop/cd'
fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
You can have automatic file closing using a with statement:
with open(f'{finalitem}.csv', 'w', newline='') as csvfile:
fcsv = csv.DictWriter(csvfile, fieldnames=headers) # assuming headers is a list object
fcsv.writeheader()
fcsv.writerows(datarows)
The error that you are having is that you are wrapping file.path() with elements that should be part of the open function and that you may be wanting to refer to the path variable instead of a path() function for your naming/downloading path.

How to iterate through all rows and save the data

I am working on a Python script to do some steps.
Right now everything is working except the part that it has to go through all table rows and save each line as a JSON object. The problem is: It only saves the last line. So it's not saving the previous lines. I know where is the problem but don't know how to fix.
Here is the code:
url = 'http://website.com/group1.html'
htmlFile= urlopen(urlGroup1)
soup= BeautifulSoup(htmlGroup1, 'html.parser')
table = soup2.find_all("table", {"class": "sortable employeeList result-table"})[0]
rows = table.find_all('tr')
Filegroup1 = open('group1.csv', 'wt+')
Datagroup1 = csv.writer(Filegroup1)
jsonFilePath = r'group1.json'
def make_json(Filegroup1, jsonFilePath):
data2 = {}
with open('group1.csv', encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for rows in csvReader:
data2 = rows
with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
jsonf.write(simplejson.dumps([data2], indent=4))
try:
for row in rows:
FilteredRow = []
for cell in row.find_all(['td', 'th']):
FilteredRow.append(cell.get_text().strip())
Datagroup1.writerow(FilteredRow)
finally:
Filegroup1.close()
make_json(Filegroup1, jsonFilePath)
The issue is here:
for rows in csvReader:
data2 = rows
If I change it to the following, it will work!! But it will group each object by Employee ID. Which I don't want that.
for rows in csvReader:
key = rows['Employee Name']
data3[key] = rows

Write data into csv

I am crawling data from Wikipedia and it works so far. I can display it on the terminal, but I can't write it the way I need it into a csv file :-/
The code is pretty long, but I paste it here anyway and hope that somebody can help me.
import csv
import requests
from bs4 import BeautifulSoup
def spider():
url = 'https://de.wikipedia.org/wiki/Liste_der_Gro%C3%9F-_und_Mittelst%C3%A4dte_in_Deutschland'
code = requests.get(url).text # Read source code and make unicode
soup = BeautifulSoup(code, "lxml") # create BS object
table = soup.find(text="Rang").find_parent("table")
for row in table.find_all("tr")[1:]:
partial_url = row.find_all('a')[0].attrs['href']
full_url = "https://de.wikipedia.org" + partial_url
get_single_item_data(full_url) # goes into the individual sites
def get_single_item_data(item_url):
page = requests.get(item_url).text # Read source code & format with .text to unicode
soup = BeautifulSoup(page, "lxml") # create BS object
def getInfoBoxBasisDaten(s):
return str(s) == 'Basisdaten' and s.parent.name == 'th'
basisdaten = soup.find_all(string=getInfoBoxBasisDaten)[0]
basisdaten_list = ['Bundesland', 'Regierungsbezirk:', 'Höhe:', 'Fläche:', 'Einwohner:', 'Bevölkerungsdichte:',
'Postleitzahl', 'Vorwahl:', 'Kfz-Kennzeichen:', 'Gemeindeschlüssel:', 'Stadtgliederung:',
'Adresse', 'Anschrift', 'Webpräsenz:', 'Website:', 'Bürgermeister', 'Bürgermeisterin',
'Oberbürgermeister', 'Oberbürgermeisterin']
with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['Bundesland', 'Regierungsbezirk:', 'Höhe:', 'Fläche:', 'Einwohner:', 'Bevölkerungsdichte:',
'Postleitzahl', 'Vorwahl:', 'Kfz-Kennzeichen:', 'Gemeindeschlüssel:', 'Stadtgliederung:',
'Adresse', 'Anschrift', 'Webpräsenz:', 'Website:', 'Bürgermeister', 'Bürgermeisterin',
'Oberbürgermeister', 'Oberbürgermeisterin']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL, extrasaction='ignore')
writer.writeheader()
for i in basisdaten_list:
wanted = i
current = basisdaten.parent.parent.nextSibling
while True:
if not current.name:
current = current.nextSibling
continue
if wanted in current.text:
items = current.findAll('td')
print(BeautifulSoup.get_text(items[0]))
print(BeautifulSoup.get_text(items[1]))
writer.writerow({i: BeautifulSoup.get_text(items[1])})
if '<th ' in str(current): break
current = current.nextSibling
print(spider())
The output is incorrect in 2 ways. The cells are their right places and only one city is written, all others are missing. It looks like this:
But it should look like this + all other cities in it:
'... only one city is written ...': You call get_single_item_data for each city. Then inside this function you open the output file with the same name, in the statement with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile: which will overwrite the output file each time you call the function.
Each variable is written to a new row: In the statement writer.writerow({i: BeautifulSoup.get_text(items[1])}) you write the value for one variable to a row. What you need to do instead is to make a dictionary for values before you start looking for page values. As you accumulate the values from the page you shove them into the dictionary by field name. Then after you have found all of the values available you call writer.writerow.

Categories

Resources