Data Gathering

1 minute read

Published:

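This post covers data gathering: downloading book pages from ISBNdb with `requests` and `requests_html`, then extracting prices from the saved HTML with BeautifulSoup.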

Data Gathering

import pandas as pd
import os
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
# Load a small sample of the Goodreads catalogue: only the first 10 rows are
# read, and lines pandas cannot parse are skipped instead of raising.
df = pd.read_csv(
    "data/goodreads/books.csv",
    nrows=10,
    on_bad_lines="skip",
)
print(df.shape)
# `display` is not imported in this file -- presumably the IPython/Jupyter
# built-in that renders the frame as a table.
display(df.head(1))

# ISBN-13 values as a plain Python list, in row order. The cells below use
# them to build ISBNdb URLs and the names of the cached HTML files.
isbn13 = df["isbn13"].values.tolist()
print(isbn13)
# Requests

# Fetch the ISBNdb page of the first book with plain `requests` and cache the
# raw HTML on disk so it can be parsed later without another request.
isbn = isbn13[0]
# Interpolate the single ISBN here, not the whole `isbn13` list (which would
# put the list's repr into the URL).
url = f"https://isbndb.com/book/{isbn}"

response = requests.get(url)
print(response)

file = os.path.join("data", "goodreads", "pages", str(isbn)+".html")
# Make sure the cache directory exists before the first write into it.
os.makedirs(os.path.dirname(file), exist_ok=True)
with open(file, "wb") as fp:
    fp.write(response.content)
# HTML Session
# Download the page of the first book again, this time through
# requests_html's HTMLSession, cache it, and parse the response body.
isbn = isbn13[0]
url = f"https://isbndb.com/book/{isbn}"

# Use the session as a context manager so it is closed once the page has been
# fetched; the response body is already fully read at that point, so
# `response.content` stays usable afterwards.
with HTMLSession() as session:
    response = session.get(url)
print(response)

file = os.path.join("data", "goodreads", "pages", str(isbn)+".html")
with open(file, "wb") as fp:
    fp.write(response.content)
soup = BeautifulSoup(response.content, "html.parser")

# Re-read the cached page of the first ISBN from disk and pull the title and
# the price out of it.
isbn = isbn13[0]
file = os.path.join("data", "goodreads", "pages", str(isbn)+".html")

with open(file, "rb") as fp:
    content = fp.read()

soup = BeautifulSoup(content, "html.parser")

title_tag = soup.find("title")
print(title_tag)
# Trim the tail of the title text. The slice end is 1 - len(suffix), so one
# character fewer than the suffix length is removed.
# NOTE(review): the suffix is specific to this one book ("Book 6"), and the
# "+1" keeps the first character of the suffix -- confirm this is intended.
title_cut = 1 - len(" (Book 6) | ISBNdb")
print(title_tag.contents[0][:title_cut])

# The price is read as the first child of the last <td> in the table carrying
# exactly this class attribute.
table = soup.find("table", attrs={"class": "table table-hover table-responsive"})
last_cell = table.findAll("td")[-1]
price = last_cell.contents[0]
print(price)
def download(isbn, timeout=30):
    """Download the ISBNdb page for ``isbn`` and cache it on disk.

    The page at ``https://isbndb.com/book/<isbn>`` is fetched with
    ``requests`` and its raw body is written to
    ``data/goodreads/pages/<isbn>.html``. The response object is printed so
    the HTTP status is visible; the body is saved whatever the status is.

    Args:
        isbn: ISBN used both in the URL and as the cached file's name.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot block the caller forever.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    url = f"https://isbndb.com/book/{isbn}"
    response = requests.get(url, timeout=timeout)
    print(response)
    file = os.path.join("data", "goodreads", "pages", str(isbn)+".html")

    # Create the cache directory on first use instead of failing on open().
    os.makedirs(os.path.dirname(file), exist_ok=True)
    with open(file, "wb") as fp:
        fp.write(response.content)
        
# Cache the ISBNdb page of every ISBN in the sample; each call to download()
# issues one HTTP request and writes one HTML file.
for isbn in isbn13:
    download(isbn)
def get_price(isbn):
    """Return the USD price found in the cached ISBNdb page of ``isbn``.

    Reads ``data/goodreads/pages/<isbn>.html``, locates the table whose class
    attribute is ``table table-hover table-responsive`` and takes the first
    child of its last ``<td>`` as the price text (expected to look like
    ``USD $12.34``).

    Args:
        isbn: ISBN whose cached page should be parsed.

    Returns:
        The price as a float, or None when the page has not been cached, the
        table is missing, the table has no usable last cell, or the cell text
        is not a parseable USD amount.
    """
    file = os.path.join("data", "goodreads", "pages", str(isbn)+".html")

    try:
        with open(file, "rb") as fp:
            content = fp.read()
    except FileNotFoundError:
        # Page was never cached: treat it like any other "price unknown" case
        # so one missing file does not abort a whole batch.
        return None

    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table", {"class": "table table-hover table-responsive"})
    if table is None:
        return None

    cells = table.find_all("td")
    if not cells or not cells[-1].contents:
        return None

    price = cells[-1].contents[0]
    # The first child can be a nested tag instead of text; only text (a str
    # subclass in BeautifulSoup) can carry the price.
    if not isinstance(price, str) or "USD" not in price:
        return None

    try:
        # Drop the currency prefix and any thousands separators first.
        return float(price.replace("USD $", "").replace(",", ""))
    except ValueError:
        return None
# Look up the cached price of every sampled book; an entry is None whenever
# get_price() returns None for that ISBN.
price = [get_price(isbn) for isbn in isbn13]
print(price)

# `isbn13` was taken from `df` row by row, so the list lines up with the rows
# and can be attached directly as a new column.
df["price"] = price
display(df.head(3))
# Write the enriched table without the row index (index=False is the
# documented boolean form).
df.to_csv("data/goodreads/books_price.csv", index=False)