Skip to content
Snippets Groups Projects
Commit e51da8b9 authored by zshan2's avatar zshan2
Browse files

assignment2.0 ver1.0: basic classes added

parent 2bf444c7
Branches master
No related tags found
1 merge request!1<assignment-2.1>
class AuthorPackage:
    """Plain value holder for the fields describing one author.

    Every constructor argument is stored unchanged on an attribute of the
    same name; no validation or type conversion is performed.

    NOTE(review): the value types are not enforced here -- presumably
    strings, numbers and lists produced by the crawler; confirm against
    the code that builds these objects.
    """

    # NOTE: the ``id`` parameter shadows the builtin of the same name inside
    # ``__init__``. It is kept as-is because renaming it would break callers
    # that pass it by keyword.
    def __init__(self, name, url, id, rating, rating_count,
                 review_count, image_url, related_authors, author_books):
        """Store the supplied author fields on the instance.

        Args:
            name: Author name.
            url: URL associated with the author.
            id: Identifier of the author.
            rating: Rating value.
            rating_count: Number of ratings.
            review_count: Number of reviews.
            image_url: URL of the author's image.
            related_authors: Collection of related authors.
            author_books: Collection of the author's books.
        """
        self.name = name
        self.url = url
        self.id = id
        self.rating = rating
        self.rating_count = rating_count
        self.review_count = review_count
        self.image_url = image_url
        self.related_authors = related_authors
        self.author_books = author_books

    def __repr__(self):
        """Return a short, debugging-friendly description of the author."""
        return f"{type(self).__name__}(name={self.name!r}, id={self.id!r})"
class BookPackage:
    """Plain value holder for the fields describing one book.

    Every constructor argument is stored unchanged on an attribute of the
    same name; no validation or type conversion is performed.

    NOTE(review): the value types are not enforced here -- presumably
    strings, numbers and lists produced by the crawler; confirm against
    the code that builds these objects.
    """

    # NOTE: the ``id`` parameter shadows the builtin of the same name inside
    # ``__init__``. It is kept as-is because renaming it would break callers
    # that pass it by keyword.
    def __init__(self, url, title, id, ISBN, author_url, author, rating,
                 rating_count, review_count, image_url, similar_books):
        """Store the supplied book fields on the instance.

        Args:
            url: URL associated with the book.
            title: Book title.
            id: Identifier of the book.
            ISBN: ISBN of the book.
            author_url: URL associated with the book's author.
            author: Author of the book.
            rating: Rating value.
            rating_count: Number of ratings.
            review_count: Number of reviews.
            image_url: URL of the book's image.
            similar_books: Collection of similar books.
        """
        self.url = url
        self.title = title
        self.id = id
        self.ISBN = ISBN
        self.author_url = author_url
        self.author = author
        self.rating = rating
        self.rating_count = rating_count
        self.review_count = review_count
        self.image_url = image_url
        self.similar_books = similar_books

    def __repr__(self):
        """Return a short, debugging-friendly description of the book."""
        return f"{type(self).__name__}(title={self.title!r}, id={self.id!r})"
\ No newline at end of file
import Crawler.Spider as Spider
import Crawler.BookPack as BookPack
import Crawler.AuthorPack as AuthorPack
# Book page used as the crawl starting point while interactive entry is disabled.
DEFAULT_BOOK_URL = 'https://www.goodreads.com/book/show/3735293-clean-code'

print('please enter URL of the book')
# Interactive entry is currently switched off; to re-enable it use: url = input()
url = DEFAULT_BOOK_URL
print('URL received, start scraping')

# Build a spider for the starting page and run its scrape step.
startPage = Spider.Spider(url)
startPage.scrap()
import requests
from bs4 import BeautifulSoup
class Spider:
    """Fetch one page and locate the "see more" link of its related-works box.

    Attributes:
        url: Address of the page to fetch, as given to the constructor.
        soup: Parsed document of the last successful fetch, or ``None``
            before ``scrap`` has been called.
    """

    # Browser-like User-Agent header value sent with the request.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")

    # CSS path to the anchor inside the element whose id starts with
    # "relatedWorks".
    SEE_MORE_SELECTOR = ('body > div.content > div.mainContentContainer > div.mainContent > '
                         'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > '
                         'div.bigBoxBody > div > a')

    # Seconds to wait for the server; without a timeout ``requests`` may
    # block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Remember the page address; nothing is fetched yet.

        Args:
            url: Address of the page to scrape.
        """
        self.url = url
        self.soup = None

    def scrap(self):
        """Download ``self.url``, parse it, and print the "see more" link.

        The parsed document is kept on ``self.soup``. Parsing uses the
        ``lxml`` parser, so the ``lxml`` package must be installed.

        Returns:
            The ``href`` value of the first anchor matched by
            ``SEE_MORE_SELECTOR`` (``None`` if that anchor has no ``href``).

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
            IndexError: If the page contains no matching anchor.
        """
        header = {'user-agent': self.USER_AGENT}
        res = requests.get(self.url, headers=header, timeout=self.REQUEST_TIMEOUT)
        # Fail early on 4xx/5xx instead of parsing an error page.
        res.raise_for_status()
        self.soup = BeautifulSoup(res.text, 'lxml')

        anchor = self.soup.select_one(self.SEE_MORE_SELECTOR)
        if anchor is None:
            # Same exception type as indexing an empty result list, but with
            # a message that says what was actually missing.
            raise IndexError(
                f"no related-works 'see more' link found on {self.url}")

        see_more_link = anchor.get("href")
        print(see_more_link)
        return see_more_link
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment