assignment2.0 ver1.0: basic classes added

e51da8b9 · zshan2 · 2bf444c7 · e51da8b9 · e51da8b9 · e51da8b9
Commit e51da8b9 authored 4 years ago by zshan2
--- a/Crawler/AuthorPack.py
+++ b/Crawler/AuthorPack.py
+
+class AuthorPackage:
+    """Plain record bundling the values gathered for one author.
+
+    Each constructor argument is stored as-is, without validation or copying,
+    on the instance under an attribute of the same name.
+
+    Note: the ``id`` parameter shadows the builtin ``id`` inside ``__init__``;
+    the name is kept because callers may pass it by keyword.
+    """
+
+    def __init__(
+        self, name, url, id, rating, rating_count,
+        review_count, image_url, related_authors, author_books
+    ):
+        # Identifying fields.
+        self.name, self.url, self.id = name, url, id
+        # Rating figures.
+        self.rating, self.rating_count, self.review_count = (
+            rating, rating_count, review_count)
+        # Image location and the two collections handed in by the caller.
+        self.image_url, self.related_authors, self.author_books = (
+            image_url, related_authors, author_books)
--- a/Crawler/BookPack.py
+++ b/Crawler/BookPack.py
+
+class BookPackage:
+    """Plain record bundling the values gathered for one book.
+
+    Each constructor argument is stored as-is, without validation or copying,
+    on the instance under an attribute of the same name.
+
+    Note: the ``id`` parameter shadows the builtin ``id`` inside ``__init__``;
+    the name is kept because callers may pass it by keyword.
+    """
+
+    def __init__(
+        self, url, title, id, ISBN, author_url, author, rating,
+        rating_count, review_count, image_url, similar_books
+    ):
+        # Identifying fields.
+        self.url, self.title, self.id, self.ISBN = url, title, id, ISBN
+        # Author reference.
+        self.author_url, self.author = author_url, author
+        # Rating figures.
+        self.rating, self.rating_count, self.review_count = (
+            rating, rating_count, review_count)
+        # Image location and the collection handed in by the caller.
+        self.image_url, self.similar_books = image_url, similar_books
\ No newline at end of file
--- a/Crawler/Main.py
+++ b/Crawler/Main.py
+import Crawler.Spider as Spider
+import Crawler.BookPack as BookPack
+import Crawler.AuthorPack as AuthorPack
+
+
+# Script body: runs at import time (there is no __main__ guard).
+# The prompt below is printed, but interactive entry is currently disabled:
+# the input() call is commented out and a fixed URL is used instead.
+print('please enter URL of the book')
+# url = input()
+url = 'https://www.goodreads.com/book/show/3735293-clean-code'
+print('URL received, start scraping')
+
+# Build a Spider for the chosen page and run its scrap() method once.
+startPage = Spider.Spider(url)
+startPage.scrap()
--- a/Crawler/Spider.py
+++ b/Crawler/Spider.py
+import requests
+from bs4 import BeautifulSoup
+
+
+class Spider:
+    """Download one page and locate the link inside its related-works box.
+
+    Attributes:
+        url: Address of the page to download, exactly as given to the
+            constructor.
+        soup: Parsed document from the most recent ``scrap`` call, or
+            ``None`` before the first call.
+    """
+
+    # Seconds to wait for the server. Without a timeout ``requests.get``
+    # may block indefinitely on an unresponsive host.
+    REQUEST_TIMEOUT = 10
+
+    def __init__(self, url):
+        self.url = url
+        self.soup = None
+
+    def scrap(self):
+        """Fetch ``self.url``, parse it, and print the related-works link.
+
+        The parsed page is kept on ``self.soup``.
+
+        Returns:
+            The ``href`` of the first anchor matched by the related-works
+            selector, or ``None`` when the page contains no such anchor.
+
+        Raises:
+            requests.RequestException: On connection failure, timeout, or an
+                HTTP error status (4xx/5xx).
+        """
+        # Browser-like user agent sent with the request.
+        header = {'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+                                "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
+
+        res = requests.get(self.url, headers=header, timeout=self.REQUEST_TIMEOUT)
+        # Fail loudly on an error response instead of parsing an error page.
+        res.raise_for_status()
+        self.soup = BeautifulSoup(res.text, 'lxml')
+
+        anchors = self.soup.select('body > div.content > div.mainContentContainer > div.mainContent > '
+                                   'div.mainContentFloat > div.rightContainer > div[id^=relatedWorks] > div > '
+                                   'div.bigBoxBody > div > a')
+        # An empty match list means the box is absent or the page layout
+        # changed; report None rather than raising a bare IndexError.
+        see_more_link = anchors[0].get("href") if anchors else None
+        print(see_more_link)
+        return see_more_link
+
+
+
+# print(soup)
+# print(soup.title.string)
+# print(soup.find_all('a'))
+# aTypes = soup.find_all('a')[0]
+# print(aTypes)
+# for obj in aTypes:
+#     if obj.
+# print(soup.)