So I am using regex to pull data from a webpage. Done.
Now I am trying to insert this data into a .csv file. No problem, right?
The trouble is pulling my data out of the loops I created so I can insert it into the .csv file. It looks like the best way to conquer this is to create a list, somehow insert the data into the list, and write the data into the csv file. But how can I do that with my current setup?
import re
import sqlite3 as lite
import mysql.connector
import urllib.request
from bs4 import BeautifulSoup
import csv

# We're pulling info on socks from e-commerce site AliExpress
url = "https://www.aliexpress.com/premium/socks.html?SearchText=socks&ltype=wholesale&d=y&tc=ppc&blanktest=0&initiative_id=SB_20171202125044&origin=y&catId=0&isViewCP=y"

req = urllib.request.urlopen(url)
soup = BeautifulSoup(req, "html.parser")
div = soup.find_all("div", attrs={"class": "item"})

for item in div:
    title_pattern = '<img alt="(.*?)"'
    comp = re.compile(title_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)

    price_pattern = 'itemprop="price">(.*?)<'
    comp = re.compile(price_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)

    seller_pattern = '<a class="store j-p4plog".*?>(.*?)<'
    comp = re.compile(seller_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)

    orders_pattern = '<em title="Total Orders">.*?<'
    comp = re.compile(orders_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        # The pattern has no capture group, so trim the tag text by slicing
        print(x[32:-1])

    feedback_pattern = '<a class="rate-num j-p4plog".*?>(.*)<'
    comp = re.compile(feedback_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        print(x)

# Creation and insertion of CSV file
# csvfile = "aliexpress.csv"
# csv = open(csvfile, "w")
# columnTitleRow = "Title,Price,Seller,Orders,Feedback,Pair"
# csv.write(columnTitleRow)
#
# for stuff in div:
#     title =
#     price =
#     seller =
#     orders =
#     feedback =
#     row = title + "," + price + "," + seller + "," + orders + "," + feedback + "," + "\n"
#     csv.write(row)
I want to be able to print these lists row by row.
Yes, you're right. Replace your print statements with appends to a list:

data = []
for item in div:
    title_pattern = '<img alt="(.*?)"'
    comp = re.compile(title_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        data.append(x)

    price_pattern = 'itemprop="price">(.*?)<'
    comp = re.compile(price_pattern)
    href = re.findall(comp, str(item))
    for x in href:
        data.append(x)
And then later, using a csv.writer object rather than the raw file handle:

writer.writerow(data)

From what I remember, the csv writer takes a list and not a rendered CSV string anyway. That's the whole point: it takes the raw data, escapes it properly, and adds the commas for you.
Edit: As explained in the comment, I misremembered the interface to the csv writer. writerow takes a list, not write. Updated.
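For completeness, here is a minimal end-to-end sketch of how the pieces could fit together, writing one CSV row per item instead of one flat list. The filename and column names come from the commented-out code in the question; the capture group added to the orders pattern is an assumption about the page markup (the question's version had no group and trimmed the match by slicing instead):

import csv
import re

# Patterns from the question, all given capture groups for uniform handling.
patterns = {
    "Title": '<img alt="(.*?)"',
    "Price": 'itemprop="price">(.*?)<',
    "Seller": '<a class="store j-p4plog".*?>(.*?)<',
    "Orders": '<em title="Total Orders">(.*?)<',   # assumed group; may need trimming like x[32:-1]
    "Feedback": '<a class="rate-num j-p4plog".*?>(.*?)<',
}

with open("aliexpress.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(patterns.keys())   # header row
    for item in div:                   # div comes from the scraping code above
        html = str(item)
        row = []
        for pattern in patterns.values():
            match = re.search(pattern, html)
            row.append(match.group(1) if match else "")
        writer.writerow(row)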
I'm trying to export some data from a website, and I first tried it on one single page. I have to import text delimited by these titles:
['Drug name','General Information','Clinical Results','Side Effects','Mechanism of Action','Literature References','Additional Information','Approval Date','Date Created','Company Name']
The url is https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus
The code currently works and gives me all the data, but when I write it to the CSV, the information is not delimited as I wish.
Since it is one single page, the Excel file should have ONE row... but it doesn't.
The code:
from bs4 import BeautifulSoup
import requests
import csv
csv_file = open('Drugs.csv', 'w')
csv_writer = csv.writer(csv_file, delimiter='+')
csv_writer.writerow(['Drug name', 'General Information', 'Clinical Results', 'Side Effects', 'Mechanism of Action', 'Literature References', 'Additional Information', 'Approval Date', 'Date Created', 'Company Name'])

link = requests.get('https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/3092-afinitor-everolimus')
aux = []
soup = BeautifulSoup(link.content, 'lxml')
drugName = soup.find('div', class_='company-navigation').find('h1').text
gralInfo = soup.find('div', class_='body directory-listing-profile__description')
y = 0
for h2 in gralInfo.find_all('h2'):
    print(y)
    text = ''
    for sibling in h2.find_next_siblings():
        if sibling.name == 'h2':
            break
        else:
            text = text + sibling.get_text(separator='\n') + '\n'
    print(text)
    aux.append(text)
    print()
    print()
    y = y + 1

auxi = []
for info in soup.find_all('div', class_='contact directory-listing-profile__master-detail'):
    print(info.text)
    auxi.append(info.text)

csv_writer.writerow([drugName, aux[0], aux[1], aux[2], aux[3], aux[4], aux[5], auxi[0], auxi[1], auxi[2]])
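One likely culprit, an assumption based on the code above rather than on inspecting the page, is that each text block contains embedded newlines from get_text(separator='\n'), which Excel renders as extra rows, and that delimiter='+' produces a file Excel will not split into columns. A minimal sketch of a possible fix, reusing drugName, aux, and auxi from the code above and taking the same slots the original writerow used:

import csv

def flatten(text):
    """Collapse embedded newlines so each record stays on one CSV row."""
    return ' '.join(text.split())

# newline='' is the csv module's recommended mode; the default comma
# delimiter lets Excel split the columns.
with open('Drugs.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Drug name', 'General Information', 'Clinical Results',
                         'Side Effects', 'Mechanism of Action', 'Literature References',
                         'Additional Information', 'Approval Date', 'Date Created',
                         'Company Name'])
    row = [drugName] + [flatten(t) for t in aux[:6]] + [flatten(t) for t in auxi[:3]]
    csv_writer.writerow(row)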
I have extracted four items (one list of strings and three strings) and I want to write all four items in a single row. I assume it is possible and just need a bit of help.
r_ingredients_wtht_tags = []
for link in r_links:
    r = requests.get(link)
    # print(r.status_code)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, "html.parser")
        r_name = soup.find('h1', {'itemprop': 'name'}).text.replace('recipe', '')
        r_prep_time = soup.select('li.recipe-meta-tag')[1].text
        r_ingrdnts_with_tags = soup.select('span[itemprop="ingredients"]')
        for r_ingrdnts in r_ingrdnts_with_tags:  # Remove span tags from list items
            r_ingredients_wtht_tags.append(r_ingrdnts.text)
        # print(r_ingredients_wtht_tags)
        # exit()
        r_image_src = soup.find('img', {'itemprop': 'image'}).get('src')
        r_image_url = 'https://website.com' + r_image_src  # dummy website
        r_url = link

        # Download the recipe image
        print('Downloading image %s...' % (r_image_url))
        rec_image = requests.get(r_image_url)
        rec_image.raise_for_status()  # Will raise an exception if the above request failed.

        # Create image folder to store current recipe image
        os.makedirs('recipe' + str(image_fold_count), exist_ok=True)

        # Save recipe image
        imageFile = open(os.path.join('recipe' + str(image_fold_count), os.path.basename(r_image_url)), 'wb')
        for chunk in rec_image.iter_content(100000):
            imageFile.write(chunk)
        imageFile.close()

        # Write to csv file. NOTE TO SELF: MOVE THIS TO ITS OWN FUNCTION
        fileWriter.writerow(zip(r_name, r_prep_time, r_ingredients_wtht_tags, r_url + "\n"))
        image_fold_count += 1  # Increment recipe folder counter
If r_ingredients_wtht_tags is a list and you want all four elements in one row but in separate cells, then you can create the list this way:

row = [r_name, r_prep_time, r_ingredients_wtht_tags, r_url]

or convert r_ingredients_wtht_tags into a string with some separator. You can even use ",": csv will wrap the field in " " automatically.

ingredients = ",".join(r_ingredients_wtht_tags)
row = [r_name, r_prep_time, ingredients, r_url]

If every ingredient has to be in a separate cell, then you can do

row = [r_name, r_prep_time] + r_ingredients_wtht_tags + [r_url]

And write it to csv without zip() (and without "\n"; csv will add it at the end of the row):

fileWriter.writerow(row)
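As a quick illustration of the quoting behavior (a standalone sketch with made-up values, not the asker's data), the joined-string variant produces a single quoted cell:

import csv
import sys

# Hypothetical values standing in for the scraped ones.
r_name = "Pancakes"
r_prep_time = "20 min"
r_ingredients_wtht_tags = ["flour", "milk", "eggs"]
r_url = "https://website.com/pancakes"

writer = csv.writer(sys.stdout)
ingredients = ",".join(r_ingredients_wtht_tags)
writer.writerow([r_name, r_prep_time, ingredients, r_url])
# Output: Pancakes,20 min,"flour,milk,eggs",https://website.com/pancakes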
I'm trying to create a text-delimited file containing the data from the "Actions" table on webpages like this one: http://stats.swehockey.se/Game/Events/300978
I would like each line to include the game # (from the end of the URL) and then the text from the line on the table. For example:
300972 | 60:00 | GK Out | OHK | 33. Hudacek, Julius
I haven't been able to get each row to actually separate. I've tried parsing through each row and column, using a list of stripped strings, and searching by different tags, classes, and styles.
Here's what I currently have:
from bs4 import BeautifulSoup
import urllib.request
def createtext():
    gamestr = urlstr + "|"

    # Find all table lines. Create one pipe-delimited line for each.
    aptext = gamestr
    for el in soup.find_all('tr'):
        playrow = el.find_all('td', 'tdOdd')
        for td in playrow:
            if td.find(text=True) not in ("", None, "\n"):
                aptext = aptext + ''.join(td.text) + "|"
        aptext = aptext + "\n" + gamestr

    # Creates a file with the game # as the filename and writes the data to it
    currentfile = urlstr + ".txt"
    with open(currentfile, "w") as f:
        f.write(str(aptext))

# Grabs the HTML file and creates the soup
urlno = 300978
urlstr = str(urlno)
url = "http://stats.swehockey.se/Game/Events/" + urlstr
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
pbpdoc = response.read().decode('utf-8')
soup = BeautifulSoup(pbpdoc, "html.parser")
createtext()
Thanks for any help or guidance!
First of all, you don't have to construct the CSV data manually; Python provides a built-in csv module for that.
Then, since you are only after the "Actions" table, I'd identify that table and find the event rows only. This can be done with a filtering function that checks that the first cell is not empty:
import csv

from bs4 import BeautifulSoup
import requests

def only_action_rows(tag):
    if tag.name == 'tr':
        first_cell = tag.find('td', class_='tdOdd')
        return first_cell and first_cell.get_text(strip=True)

event_id = 300978
url = "http://stats.swehockey.se/Game/Events/{event_id}".format(event_id=event_id)
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

actions_table = soup.find("h2", text="Actions").find_parent("table")
data = [[event_id] + [td.get_text(strip=True) for td in row.find_all('td', class_='tdOdd')]
        for row in actions_table.find_all(only_action_rows)]

with open("output.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(data)
Note that I'm using requests here.
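Since the question asked for pipe-delimited lines, the same writer can produce them: csv.writer takes a delimiter argument, and passing newline='' to open() lets the csv module control line endings. A small variation on the snippet above, not part of the original answer:

with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter="|")
    writer.writerows(data)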
I've researched everywhere and I am having trouble. I want to access each individual element and append the values to my empty Python lists, and of course after I have the correct data in my lists I am going to write them out to a csv file.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
html = urlopen("http://exoweb0.donbest.com/checkHTML/servlet/send_archive_history.servlet?by_casino=1&for_casino=37&league=1&game=0&date=20151029")
soup = BeautifulSoup(html, "html.parser")
trs = soup.findAll("tr")

dates = []
rotations = []
games = []
lines = []
types = []

for tr in trs.children:
    print(tr)
    # tds = tr.findAll("td")   # Finds all tds in the current tr
    # print(tds)
    # date = tds.contents[0]   # references the first element in each td
    # rot = tds[2]             # references the third element in each td
    # game = tds[3]            # references the fourth element in each td
    # line = tds[4]            # references the fifth element in each td
    # type_l = tds[5]          # references the sixth element in each td
    # dates.append(date)       # Trying to append to my empty list
    # rotations.append(rot)
    # games.append(game)
    # lines.append(line)
    # types.append(type_l)
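One thing worth noting: findAll returns a ResultSet, which has no .children attribute, so the loop as written will raise an AttributeError; iterating the ResultSet directly works. Below is a minimal sketch of how the commented-out lines might be wired up and written out, assuming each data row really has six td cells in the order the comments describe (an assumption based on those comments, not on inspecting the page; the lines.csv filename is made up):

import csv

for tr in trs:  # iterate the ResultSet directly
    tds = tr.findAll("td")
    if len(tds) < 6:  # skip header and spacer rows
        continue
    dates.append(tds[0].get_text(strip=True))
    rotations.append(tds[2].get_text(strip=True))
    games.append(tds[3].get_text(strip=True))
    lines.append(tds[4].get_text(strip=True))
    types.append(tds[5].get_text(strip=True))

# Write the collected columns out, one row per record
with open("lines.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Date", "Rotation", "Game", "Line", "Type"])
    writer.writerows(zip(dates, rotations, games, lines, types))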
I'm trying to extract the data on the crime rate across states from this webpage:
http://www.disastercenter.com/crime/uscrime.htm
I am able to get this into a text file, but I would like to get the response in JSON format. How can I do this in Python?
Here is my code:
import urllib
import re
from bs4 import BeautifulSoup
link = "http://www.disastercenter.com/crime/uscrime.htm"
f = urllib.urlopen(link)
myfile = f.read()
soup = BeautifulSoup(myfile)
soup1=soup.find('table', width="100%")
soup3=str(soup1)
result = re.sub("<.*?>", "", soup3)
print(result)
output=open("output.txt","w")
output.write(result)
output.close()
The following code will get the data from the two tables and output all of it as a json formatted string.
Working Example (Python 2.7.9):
from lxml import html
import requests
import re as regular_expression
import json

page = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = html.fromstring(page.text)

tables = [tree.xpath('//table/tbody/tr[2]/td/center/center/font/table/tbody'),
          tree.xpath('//table/tbody/tr[5]/td/center/center/font/table/tbody')]

tabs = []
for table in tables:
    tab = []
    for row in table:
        for col in row:
            var = col.text_content()
            var = var.strip().replace(" ", "")
            var = var.split('\n')
            if regular_expression.match('^\d{4}$', var[0].strip()):
                tab_row = {}
                tab_row["Year"] = var[0].strip()
                tab_row["Population"] = var[1].strip()
                tab_row["Total"] = var[2].strip()
                tab_row["Violent"] = var[3].strip()
                tab_row["Property"] = var[4].strip()
                tab_row["Murder"] = var[5].strip()
                tab_row["Forcible_Rape"] = var[6].strip()
                tab_row["Robbery"] = var[7].strip()
                tab_row["Aggravated_Assault"] = var[8].strip()
                tab_row["Burglary"] = var[9].strip()
                tab_row["Larceny_Theft"] = var[10].strip()
                tab_row["Vehicle_Theft"] = var[11].strip()
                tab.append(tab_row)
    tabs.append(tab)

json_data = json.dumps(tabs)

output = open("output.txt", "w")
output.write(json_data)
output.close()
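If a more human-readable file is wanted, json.dumps also accepts an indent argument (a minor variation, not in the original answer):

json_data = json.dumps(tabs, indent=2)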
This might be what you want, if you can use the requests and lxml modules. The data structure presented here is very simple; adjust it to your needs.
First, get a response from your requested URL and parse the result into an HTML tree:
import requests
from lxml import etree
import json
response = requests.get("http://www.disastercenter.com/crime/uscrime.htm")
tree = etree.HTML(response.text)
Assuming you want to extract both tables, create this XPath and unpack the results. totals is "Number of Crimes" and rates is "Rate of Crime per 100,000 People":
xpath = './/table[@width="100%"][@style="background-color: rgb(255, 255, 255);"]//tbody'
totals, rates = tree.findall(xpath)
Extract the raw data (td.find('./') means first child item, whatever tag it has) and clean the strings (r'' raw strings are needed for Python 2.x):
raw_data = []
for tbody in totals, rates:
    rows = []
    for tr in tbody.getchildren():
        row = []
        for td in tr.getchildren():
            child = td.find('./')
            if child is not None and child.tag != 'br':
                row.append(child.text.strip(r'\xa0').strip(r'\n').strip())
            else:
                row.append('')
        rows.append(row)
    raw_data.append(rows)
Zip together the table headers in the first two rows, then delete the redundant header rows, using slice steps of 12 and 11:
data = {}
data['tags'] = [tag0 + tag1 for tag0, tag1 in zip(raw_data[0][0], raw_data[0][1])]
for raw in raw_data:
    del raw[::12]
    del raw[::11]
Store the rest of the raw data and create a JSON file (optional: eliminate whitespace with separators=(',', ':')):
data['totals'], data['rates'] = raw_data[0], raw_data[1]
with open('data.json', 'w') as f:
    json.dump(data, f, separators=(',', ':'))
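As a quick sanity check, the file can be loaded straight back (a usage sketch, nothing page-specific):

import json

with open('data.json') as f:
    data = json.load(f)

print(data['tags'])       # combined header labels
print(data['totals'][0])  # first data row of "Number of Crimes"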