Data Gathering
Published:
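This post covers data gathering: downloading ISBNdb book pages for a sample of Goodreads ISBNs and scraping each book's price from the saved HTML.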
Data Gathering
import pandas as pd
import os
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
# Load only the first 10 rows of the books CSV; lines pandas cannot parse are
# skipped instead of aborting the read.
df = pd.read_csv(
    "data/goodreads/books.csv",
    nrows=10,
    on_bad_lines="skip",
)
print(df.shape)
display(df.head(1))

# Plain Python list of the sample's `isbn13` column, used below to build the
# ISBNdb page URLs and the cached file names.
isbn13 = df["isbn13"].tolist()
print(isbn13)
# Requests
# Fetch the ISBNdb page of the first sampled book with plain `requests` and
# save the raw response body under data/goodreads/pages/<isbn>.html.
isbn = isbn13[0]
# Interpolate the single ISBN, not the whole `isbn13` list: formatting the
# list would put its Python repr ("[..., ...]") into the URL.
url = f"https://isbndb.com/book/{isbn}"
response = requests.get(url)
print(response)
file = os.path.join("data", "goodreads", "pages", str(isbn) + ".html")
# Binary mode: `response.content` is bytes, written unchanged.
with open(file, "wb") as fp:
    fp.write(response.content)
# HTML Session
# Fetch the first sampled book's page through a requests_html HTMLSession,
# save the raw bytes to disk and parse them with BeautifulSoup.
isbn = isbn13[0]
url = f"https://isbndb.com/book/{isbn}"

session = HTMLSession()
response = session.get(url)
print(response)

# Build the cache path in two steps: directory first, then the file name.
pages_dir = os.path.join("data", "goodreads", "pages")
file = os.path.join(pages_dir, str(isbn) + ".html")
html_bytes = response.content
with open(file, "wb") as fp:
    fp.write(html_bytes)

soup = BeautifulSoup(html_bytes, "html.parser")
# Re-load the cached page of the first sampled book from disk and explore it.
isbn = isbn13[0]
pages_dir = os.path.join("data", "goodreads", "pages")
file = os.path.join(pages_dir, str(isbn) + ".html")
with open(file, "rb") as fp:
    content = fp.read()
soup = BeautifulSoup(content, "html.parser")

# Show the <title> tag, then its text with a trailing suffix cut off.
title_tag = soup.find("title")
print(title_tag)
# The suffix is hard-coded for this one page. The slice end is
# -len(suffix) + 1, so one character fewer than the suffix is removed.
# NOTE(review): confirm the +1 is intended and not an off-by-one.
title_cut = -len(" (Book 6) | ISBNdb") + 1
print(title_tag.contents[0][:title_cut])

# The first child of the last <td> in this table is treated as the price.
table = soup.find("table", {"class": "table table-hover table-responsive"})
cells = table.findAll("td")
price = cells[-1].contents[0]
print(price)
def download(isbn, timeout=30):
    """Fetch the ISBNdb page for *isbn* and cache it on disk.

    The response body is written to data/goodreads/pages/<isbn>.html
    whatever the HTTP status is; the response object is only printed, not
    checked.

    Args:
        isbn: ISBN used both in the request URL and in the cached file name.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        None.
    """
    url = f"https://isbndb.com/book/{isbn}"
    # Without a timeout, requests can wait indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    print(response)

    pages_dir = os.path.join("data", "goodreads", "pages")
    # Create the cache directory on first use so the write below does not
    # fail just because the directory is missing.
    os.makedirs(pages_dir, exist_ok=True)
    file = os.path.join(pages_dir, str(isbn) + ".html")
    with open(file, "wb") as fp:
        fp.write(response.content)
# Cache the ISBNdb page of every sampled ISBN on disk, one request per book.
for isbn in isbn13:
    download(isbn=isbn)
def get_price(isbn):
    """Return the USD price scraped from the cached ISBNdb page for *isbn*.

    Reads data/goodreads/pages/<isbn>.html, finds the table whose class
    attribute is "table table-hover table-responsive" and takes the first
    child of its last <td> cell as the price text.

    Args:
        isbn: ISBN used to build the cached file name.

    Returns:
        The price as a float when the cell text contains "USD" and the rest
        parses as a number. None when the table is missing, has no cells,
        the last cell is empty, its first child is not text, the text does
        not mention "USD", or the number cannot be parsed.

    Raises:
        FileNotFoundError: If no cached page exists for *isbn*.
    """
    file = os.path.join("data", "goodreads", "pages", str(isbn) + ".html")
    with open(file, "rb") as fp:
        content = fp.read()

    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table", {"class": "table table-hover table-responsive"})
    if table is None:
        return None

    cells = table.find_all("td")
    # Guard the two index lookups: a table without cells or an empty last
    # cell would otherwise raise IndexError.
    if not cells or not cells[-1].contents:
        return None

    price = cells[-1].contents[0]
    # The first child may be a nested tag instead of text; only text nodes
    # (which are str instances) can carry the "USD $..." label.
    if not isinstance(price, str) or "USD" not in price:
        return None

    try:
        # Drop the currency label and any thousands separators first.
        return float(price.replace("USD $", "").replace(",", ""))
    except ValueError:
        return None
# Scrape the price of every sampled ISBN from its cached page. Entries are
# None where get_price found no USD price.
price = [get_price(isbn) for isbn in isbn13]
print(price)

# Attach the prices as a new column; the list is in the same order as the
# rows that `isbn13` was taken from.
df["price"] = price
display(df.head(3))

# index=False is the documented way to leave the row index out of the CSV.
df.to_csv("data/goodreads/books_price.csv", index=False)