How do I fix/prevent Data Overwriting Issue in Web Scrape Loop? - python

I was able to loop the web scraping process, but the data collected from each page replaces the data from the previous page, so the exported file ends up containing only the data from the last page. What do I need to do?
from bs4 import BeautifulSoup
import requests
import pandas as pd

print('all imported successfully')

for x in range(1, 44):
    link = f'https://www.trustpilot.com/review/birchbox.com?page={x}'
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")
    names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
    headers = soup.find_all('h2', attrs={'class': 'review-content__title'})
    bodies = soup.find_all('p', attrs={'class': 'review-content__text'})
    ratings = soup.find_all('div', attrs={'class': 'star-rating star-rating--medium'})
    dates = soup.find_all('div', attrs={'class': 'review-content-header__dates'})
    print('pass1')
    df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Date': dates})
    df.to_csv('birchbox006.csv', index=False, encoding='utf-8')
    print('excel done')

Because you are using a loop, the variables are constantly being overwritten. Normally what you'd do in a situation like this is keep a list and append to it throughout the loop:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

print('all imported successfully')

# Initialize an empty dataframe
df = pd.DataFrame()

for x in range(1, 44):
    names = []
    headers = []
    bodies = []
    ratings = []
    published = []
    updated = []
    reported = []

    link = f'https://www.trustpilot.com/review/birchbox.com?page={x}'
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")

    articles = soup.find_all('article', {'class': 'review'})
    for article in articles:
        names.append(article.find('div', attrs={'class': 'consumer-information__name'}).text.strip())
        headers.append(article.find('h2', attrs={'class': 'review-content__title'}).text.strip())
        try:
            bodies.append(article.find('p', attrs={'class': 'review-content__text'}).text.strip())
        except AttributeError:
            bodies.append('')
        try:
            # use the star-rating div (the original snippet repeated the review-text selector here by mistake)
            ratings.append(article.find('div', attrs={'class': 'star-rating star-rating--medium'}).text.strip())
        except AttributeError:
            ratings.append('')

        dateElements = article.find('div', attrs={'class': 'review-content-header__dates'}).text.strip()
        jsonData = json.loads(dateElements)
        published.append(jsonData['publishedDate'])
        updated.append(jsonData['updatedDate'])
        reported.append(jsonData['reportedDate'])

    # Create a temporary dataframe for this page, then append it to the "final" dataframe
    temp_df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Published Date': published, 'Updated Date': updated, 'Reported Date': reported})
    # DataFrame.append was removed in pandas 2.0; pd.concat does the same job
    df = pd.concat([df, temp_df]).reset_index(drop=True)
    print('pass1')

df.to_csv('birchbox006.csv', index=False, encoding='utf-8')
print('excel done')

The reason is that you are overwriting your variables in each iteration.
If you want to extend these variables instead, you can do, for example:
names = []
headers = []
bodies = []
ratings = []
dates = []

for x in range(1, 44):
    link = f'https://www.trustpilot.com/review/birchbox.com?page={x}'
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")
    names += soup.find_all('div', attrs={'class': 'consumer-information__name'})
    headers += soup.find_all('h2', attrs={'class': 'review-content__title'})
    bodies += soup.find_all('p', attrs={'class': 'review-content__text'})
    ratings += soup.find_all('div', attrs={'class': 'star-rating star-rating--medium'})
    dates += soup.find_all('div', attrs={'class': 'review-content-header__dates'})
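Note that with this approach the lists hold BeautifulSoup Tag objects rather than strings, so you still need to pull out the text before building the DataFrame. A minimal sketch of that last step, assuming the lists above have been filled and happen to be the same length:

import pandas as pd

# extract the visible text from each collected Tag before building the table
df = pd.DataFrame({
    'User Name': [tag.text.strip() for tag in names],
    'Header': [tag.text.strip() for tag in headers],
    'Body': [tag.text.strip() for tag in bodies],
    'Rating': [tag.text.strip() for tag in ratings],
    'Date': [tag.text.strip() for tag in dates],
})
df.to_csv('birchbox006.csv', index=False, encoding='utf-8')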

You'll have to store that data somewhere after each iteration. There are a few ways you can do it: you can store everything in a list and then create your dataframe, or, as I did here, create a "temporary" dataframe on each iteration and append it to the final dataframe. Think of it like bailing water: you fill a small bucket and empty it into a large bucket that collects all the water.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

print('all imported successfully')

# Initialize an empty dataframe
df = pd.DataFrame()

for x in range(1, 44):
    published = []
    updated = []
    reported = []

    link = f'https://www.trustpilot.com/review/birchbox.com?page={x}'
    print(link)
    req = requests.get(link)
    content = req.content
    soup = BeautifulSoup(content, "lxml")

    names = [tag.text.strip() for tag in soup.find_all('div', attrs={'class': 'consumer-information__name'})]
    headers = [tag.text.strip() for tag in soup.find_all('h2', attrs={'class': 'review-content__title'})]
    bodies = [tag.text.strip() for tag in soup.find_all('p', attrs={'class': 'review-content__text'})]
    ratings = [tag.text.strip() for tag in soup.find_all('div', attrs={'class': 'star-rating star-rating--medium'})]

    dateElements = soup.find_all('div', attrs={'class': 'review-content-header__dates'})
    for date in dateElements:
        jsonData = json.loads(date.text.strip())
        published.append(jsonData['publishedDate'])
        updated.append(jsonData['updatedDate'])
        reported.append(jsonData['reportedDate'])

    # Create a temporary dataframe for this page, then append it to the "final" dataframe
    temp_df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Published Date': published, 'Updated Date': updated, 'Reported Date': reported})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    df = pd.concat([df, temp_df]).reset_index(drop=True)
    print('pass1')

df.to_csv('birchbox006.csv', index=False, encoding='utf-8')
print('excel done')

Related

Trying to export data taken from HTML using beautiful soup

I'm trying to extract information from HTML pages fetched from URLs built in a for loop, then parse them with Beautiful Soup.
I manage to isolate the information correctly, but when I try to export the data I get the error message "All arrays must be of the same length".
import bs4 as bs
import pandas as pd
import random
import requests

weblink = []
filing_type = []
company_name = []
name = []
date = []

# Importing file
df = pd.read_csv('Downloads\Dropped_Companies.csv')

# Getting companies' names into a list
companies_column = list(df.columns.values)[4]
name_ = df[companies_column].tolist()

# Formatting company names for creating URLs
for CompanyName in name_:
    company_name.append(CompanyName.lower().replace(" ", '_'))
company_name

for item in range(0, len(company_name)):
    link = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&company=' + company_name[item] + '&type=10-K&dateb=&owner=exclude&count=100'
    # Getting the HTML text (headers_list is defined elsewhere as a list of request-header dicts)
    headers = random.choice(headers_list)
    r = requests.Session()
    r.headers = headers
    html = r.get(link).text
    # Calling Beautiful Soup to parse the HTML text
    soup = bs.BeautifulSoup(html)
    tet_ = soup.find_all("a", id="documentsbutton")
    # Get the links
    for link in tet_:
        weblink.append('https://www.sec.gov' + link.get('href'))
    test11 = soup.find_all("table", class_="tableFile2")
    for link in test11:
        row = link.find_all("td", nowrap="nowrap")
        for i in range(0, len(row), 3):
            filing_type.append(row[i].getText())
        date.append(link.find("td", class_="small").find_next_sibling("td").text)
        name.append(company_name[item])

data = {'Company Name': name, 'Filing Date': date, 'Filing Type': filing_type, "Weblink": weblink}
outputdf = pd.DataFrame(data)
outputdf.to_csv('Downloads/t_10KLinks.csv')
data = {'Company Name': name, 'Filing Date': date, 'Filing Type': filing_type, "Weblink": weblink}
outputdf = pd.DataFrame.from_dict(data, orient='index')
outputdf.to_csv('Downloads/t_10KLinks.csv')
See the pandas.DataFrame.from_dict documentation.
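The reason this sidesteps the "All arrays must be of the same length" error is that orient='index' builds one row per dictionary key, and rows of different lengths are simply padded with NaN. A small illustrative example with made-up values, not the scraped data:

import pandas as pd

data = {'Filing Type': ['10-K', '10-K', '10-K'], 'Weblink': ['link1', 'link2']}

# each key becomes a row with columns 0..2; the missing cell in the shorter row is NaN
df = pd.DataFrame.from_dict(data, orient='index')
print(df)

# transpose if you prefer the keys as columns
print(df.T)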

Python scraper not writing correctly to csv

I'm new to Python and programming, and I'm having a bit of a problem with my project. I'm trying to scrape a website for data and save it to a CSV. It works, but when I write the "lst" list to the "Image URL" and "Image Featured" columns, the brackets "[" and "]" and
the quotes '"' also get written to the CSV file. Is there a way to remove them? I know it's because the "lst" list contains other lists with the URLs.
import csv
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import date

today = date.today()

source = requests.get('https://www.meklarin.fo/').text
soup = BeautifulSoup(source, 'lxml')

df = pd.read_csv(r'C:\Users\username\Desktop\Kassin.fo\kassin\blog\management\commands\test.csv')
print(df.to_string())

original_house_title_list = []
original_house_link_list = []
house_titles_list = []
house_asking_price_list = []
house_current_bid_price_list = []
house_link_list = []
product = 'product'
current_date = today.strftime("%m.%d.%y")
house_image_list = []
house_location_list = []
lst = []
lst1 = []
house_info_list = []
house_final_info = []
list_convert = []

for house_link in soup.find_all('a', class_='house-air-content'):
    house_link = house_link.get('href')
    house_link_list.append(house_link.strip())
    print(house_link.strip())

for house_link in house_link_list:
    if house_link in original_house_link_list:
        continue
    else:
        source = requests.get(house_link).text
        soup = BeautifulSoup(source, 'lxml')

        for house_titles in soup.find_all('div', class_='ogn-base-info'):
            house_title = house_titles.h1.text
            house_titles_list.append(house_title)
            #print(house_title)

        for house__asking_price in soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 house-ask-price house-price-column'):
            house_asking_price = house__asking_price.text
            house_asking_price = str(house_asking_price)
            house_asking_price = house_asking_price.removeprefix('Prísuppskotkr.')
            house_asking_price = house_asking_price.replace('.', '')
            house_asking_price_list.append(house_asking_price.strip())
            #print(house_asking_price.strip())

        for house__current_bid_price in soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 house-bid-price house-price-column'):
            house_current_bid_price = house__current_bid_price.h3.text
            house_current_bid_price = str(house_current_bid_price)
            house_current_bid_price = house_current_bid_price.replace('.', '')
            house_current_bid_price = house_current_bid_price.replace('kr', '')
            house_current_bid_price_list.append(house_current_bid_price.strip())
            print(house_current_bid_price.strip())

        for house_all_images in soup.find_all('a'):
            if 'https://www.meklarin.fo/wp-content/uploads' in str(house_all_images):
                house_all_images = house_all_images.get('href')
                house_image_list.append(house_all_images)
                #print(house_all_images)
            else:
                continue
        lst.append(house_image_list)
        lst1.append(lst)
        house_image_list = []

        for house_build_year in soup.find_all('div', class_='house-info-box-value'):
            if 'Trýst her' in str(house_build_year):
                continue
            else:
                print(house_build_year.text)

        for house_info in soup.find_all('div', class_='house-desc-comp'):
            house_info = house_info.text
            house_info = str(house_info)
            house_info = house_info.replace('Upplýsingar um bústaðin', '')
            house_info_list.append(house_info)
            #print(house_info)
        house_final_info.append(house_info)
        house_info_list = []

dict = {'Title': house_titles_list, 'Content': house_final_info, 'Date': current_date, 'Post Type': product, 'Price': house_asking_price_list, 'Regular Price': house_asking_price_list, 'Sale Price': house_asking_price_list, 'Stock Status': 'instock', 'Image URL': lst, 'Image Title': house_titles_list, 'Image Featured': lst}
df = pd.DataFrame(dict)
df.to_csv('test.csv')

print(len(house_titles_list))
print(len(house_asking_price_list))
print(len(lst))
print(len(house_final_info))
To remove the list brackets in a cell, for example in Image URL, try this before writing to the file:
df['Image URL'] = [','.join(map(str, i)) for i in df['Image URL']]
The same line can be copied with Image URL changed to Image Featured to clean up the list in the other column, as in the sketch below.
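A small sketch of applying the same cleanup to both columns before the export, assuming df has already been built as in the question:

# flatten the list cells into comma-separated strings for both image columns
for col in ('Image URL', 'Image Featured'):
    df[col] = [','.join(map(str, cell)) for cell in df[col]]

df.to_csv('test.csv', index=False)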

Attempting to export parsed data to CSV file with Python and I can't figure out how to export more than one row

I'm fairly new to Beautiful Soup/Python/web scraping, and I have been able to scrape data from a site, but I am only able to export the very first row to a CSV file (I want to export all of the scraped data into the file).
I am stumped on how to make this code export ALL of the scraped data as multiple individual rows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

r = requests.get("https://www.infoplease.com/primary-sources/government/presidential-speeches/state-union-addresses")
data = r.content  # Content of response
soup = BeautifulSoup(data, "html.parser")

for span in soup.find_all("span", {"class": "article"}):
    for link in span.select("a"):
        name_and_date = link.text.split('(')
        name = name_and_date[0].strip()
        date = name_and_date[1].replace(')', '').strip()
        base_url = "https://www.infoplease.com"
        links = link['href']
        links = urljoin(base_url, links)

        pres_data = {'Name': [name],
                     'Date': [date],
                     'Link': [links]
                     }

        df = pd.DataFrame(pres_data, columns=['Name', 'Date', 'Link'])
        df.to_csv(r'C:\Users\ThinkPad\Documents\data_file.csv', index=False, header=True)
        print(df)
Any ideas here? I believe I need to loop through the parsed data, grab each set, and push it in.
Am I going about this the right way?
Thanks for any insight.
The way it is currently set up, you are not adding each link as a new entry; only the last link ends up in the file. If you initialize a list before the loops and append a dictionary to it on each iteration of the inner "links" for loop, you will add every row and not just the last one.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

r = requests.get("https://www.infoplease.com/primary-sources/government/presidential-speeches/state-union-addresses")
data = r.content  # Content of response
soup = BeautifulSoup(data, "html.parser")

pres_data = []
for span in soup.find_all("span", {"class": "article"}):
    for link in span.select("a"):
        name_and_date = link.text.split('(')
        name = name_and_date[0].strip()
        date = name_and_date[1].replace(')', '').strip()
        base_url = "https://www.infoplease.com"
        links = link['href']
        links = urljoin(base_url, links)

        this_data = {'Name': name,
                     'Date': date,
                     'Link': links
                     }
        pres_data.append(this_data)

df = pd.DataFrame(pres_data, columns=['Name', 'Date', 'Link'])
df.to_csv(r'C:\Users\ThinkPad\Documents\data_file.csv', index=False, header=True)
print(df)
You don't need to use pandas here, since you aren't applying any kind of data operation.
For a short task like this, try to stick to the built-in libraries.
import requests
from bs4 import BeautifulSoup
import csv


def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    target = [([x.a['href']] + x.a.text[:-1].split(' ('))
              for x in soup.select('span.article')]
    with open('data.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Url', 'Name', 'Date'])
        writer.writerows(target)


main('https://www.infoplease.com/primary-sources/government/presidential-speeches/state-union-addresses')
Sample of output:

Google Play Crawler results - save to CSV

I made a simple crawler that reads a CSV file of Google Play packages like com.viber.voip and goes to the full link like https://play.google.com/store/apps/details?id=com.viber.voip&hl=en.
It then crawls the title, publisher, downloads, etc. and stores them in a list.
The problem is that when I try to save the results to a CSV file, it throws an error if I export using pandas to_csv, or throws a UnicodeError when it finds some unknown characters. I tried adding .encode or .decode but it doesn't help. Can someone assist, please?
import bs4 as bs
import urllib.request
import pandas as pd
import csv


def searcher(bundles):
    html = urllib.request.urlopen(base_url + bundles + post_url).read()
    soup = bs.BeautifulSoup(html, 'html.parser')
    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class': 'document-subtitle primary'}).get_text()
    category = soup.find('a', {'class': 'document-subtitle category'}).get_text()
    ratings = soup.find('meta', {'itemprop': 'ratingValue'}).get('content')
    reviews = soup.find('span', {'class': 'reviews-num'}).get_text()
    downloads = soup.find('div', {'itemprop': 'numDownloads'}).get_text()
    updated_last_time = soup.find('div', {'class': 'content'}).get_text()
    text = (bundles, title_app, publisher_name, category, ratings, reviews, downloads, updated_last_time)
    return (text)


def store(crawled_data):
    writer = csv.writer(f)
    labels = ['bundles', 'title_app', 'publisher_name', 'category', 'ratings', 'reviews', 'downloads', 'updated_last_time']
    writer.writerow(labels)
    df = pd.DataFrame(crawled_data)
    for row in df:
        if row != None:
            writer.writerow(row)


base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'
crawled_data = []
crawled_packages = 0

with open('links.csv', 'r') as f:
    df = pd.read_csv(f)
    urls = df['URLs']
    for bundles in urls:
        if bundles != None:
            aaa = searcher(bundles)
            print(crawled_packages)
            crawled_packages += 1
            if crawled_data != None:
                crawled_data.append(aaa)

store(crawled_data)
You can use to_csv() by specifying an output file to use. Also specify your column names whilst building the dataframe:
import bs4 as bs
import urllib.request
import pandas as pd
import csv


def searcher(bundles):
    html = urllib.request.urlopen(base_url + bundles + post_url).read()
    soup = bs.BeautifulSoup(html, 'html.parser')
    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class': 'document-subtitle primary'}).get_text(strip=True)
    category = soup.find('a', {'class': 'document-subtitle category'}).get_text(strip=True)
    ratings = soup.find('meta', {'itemprop': 'ratingValue'}).get('content')
    reviews = soup.find('span', {'class': 'reviews-num'}).get_text(strip=True)
    downloads = soup.find('div', {'itemprop': 'numDownloads'}).get_text(strip=True)
    updated_last_time = soup.find('div', {'class': 'content'}).get_text(strip=True)
    return (bundles, title_app, publisher_name, category, ratings, reviews, downloads, updated_last_time)


def store(crawled_data):
    labels = ['bundles', 'title_app', 'publisher_name', 'category', 'ratings', 'reviews', 'downloads', 'updated_last_time']
    df = pd.DataFrame(crawled_data, columns=labels)
    df.to_csv('output.csv', index=False)


base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'
crawled_data = []
crawled_packages = 0

with open('links.csv', 'r') as f:
    df = pd.read_csv(f)
    urls = df['URLs']
    for bundles in urls:
        if bundles != None:
            aaa = searcher(bundles)
            print(crawled_packages)
            crawled_packages += 1
            if crawled_data != None:
                crawled_data.append(aaa)

store(crawled_data)
This will give you an output.csv file containing:
bundles,title_app,publisher_name,category,ratings,reviews,downloads,updated_last_time
com.viber.voip,Viber Messenger - Android Apps on Google Play,Viber Media S.à r.l.,Communication,4.336112022399902,"11,016,404","500,000,000 - 1,000,000,000","March 15, 2018"

Parsing data with BeautifulSoup and sorting data with pandas DataFrame to_csv

My goal is to parse data from a website and store it in a text file formatted so that it can be opened in Excel.
Here is the code:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
import json
import pandas as pd
import csv

pag = range(2, 126)

out_file = open('bestumbrellasoffi.txt', 'w', encoding='utf-8')
with open('bestumbrellasoffi.txt', 'w', encoding='utf-8') as file:
    for x in pag:
        # Iterate pages
        url = 'https://www.paginegialle.it/ricerca/lidi%20balneari/italia/p-' + str(x) + '?mr=50'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        # Parse data
        for i, j, k, p, z in zip(soup.find_all('span', attrs={'itemprop': 'name'}),
                                 soup.find_all('span', attrs={'itemprop': 'longitude'}),
                                 soup.find_all('span', attrs={'itemprop': 'latitude'}),
                                 soup.find_all('span', attrs={'class': 'street-address'}),
                                 soup.find_all('div', attrs={'class': 'tel elementPhone'})):
            info = i.text, j.text, k.text, p.text, z.text
            # Check if data is good
            print(url)
            print(info)
            # Create dataframe
            raw_data = {'nome': [i], 'longitudine': [j], 'latitudine': [k], 'indirizzo': [p], 'telefono': [z]}
            print(raw_data)
            df = pd.DataFrame(raw_data, columns=['nome', 'longitudine', 'latitudine', 'indirizzo', 'telefono'])
            df.to_csv('bestumbrellasoffi.txt')

out_file.close()
All those modules are imported because I made many attempts.
The print(info) and print(raw_data) calls are there to check what each parsed record looks like.
Here is the code, revised and working:
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
import json
import pandas as pd
import csv

pag = range(2, 126)

with open('bestumbrellasoffia.txt', 'a', encoding='utf-8') as file:
    for x in pag:
        # Iterate pages
        url = 'https://www.paginegialle.it/ricerca/lidi%20balneari/italia/p-' + str(x) + '?mr=50'
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        raw_data = {'nome': [], 'longitudine': [], 'latitudine': [], 'indirizzo': [], 'telefono': []}
        df = pd.DataFrame(raw_data, columns=['nome', 'longitudine', 'latitudine', 'indirizzo', 'telefono'])
        # Parse data
        for i, j, k, p, z in zip(soup.find_all('span', attrs={'itemprop': 'name'}),
                                 soup.find_all('span', attrs={'itemprop': 'longitude'}),
                                 soup.find_all('span', attrs={'itemprop': 'latitude'}),
                                 soup.find_all('span', attrs={'class': 'street-address'}),
                                 soup.find_all('div', attrs={'class': 'tel elementPhone'})):
            inno = i.text.lstrip()
            ye = inno.rstrip()
            info = ye, j.text, k.text, p.text, z.text
            # Check if data is good
            print(info)
            # Create dataframe
            raw_data = {'nome': [i], 'longitudine': [j], 'latitudine': [k], 'indirizzo': [p], 'telefono': [z]}
            # Try dataframe
            #print(raw_data)
            file.write(str(info) + "\n")
You're initialising a new DataFrame on every iteration of your for loop.
You should initialise it outside the loop, then add each row of data to it inside the loop, for example as in the sketch below.
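A minimal sketch of that idea, keeping the question's selectors but collecting one row per listing in a list and building the DataFrame once at the end (DataFrame.append is removed in newer pandas, so a plain list is used instead):

from bs4 import BeautifulSoup
import requests
import pandas as pd

rows = []  # collect one dict per listing instead of rebuilding the DataFrame each time

for x in range(2, 126):
    url = 'https://www.paginegialle.it/ricerca/lidi%20balneari/italia/p-' + str(x) + '?mr=50'
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for i, j, k, p, z in zip(soup.find_all('span', attrs={'itemprop': 'name'}),
                             soup.find_all('span', attrs={'itemprop': 'longitude'}),
                             soup.find_all('span', attrs={'itemprop': 'latitude'}),
                             soup.find_all('span', attrs={'class': 'street-address'}),
                             soup.find_all('div', attrs={'class': 'tel elementPhone'})):
        rows.append({'nome': i.text.strip(), 'longitudine': j.text, 'latitudine': k.text,
                     'indirizzo': p.text, 'telefono': z.text})

# build the DataFrame once, after all pages have been scraped
df = pd.DataFrame(rows, columns=['nome', 'longitudine', 'latitudine', 'indirizzo', 'telefono'])
df.to_csv('bestumbrellasoffi.txt', index=False)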
