diff --git a/kindle/.gitignore b/kindle/.gitignore
index 52158a9..27c96ab 100644
--- a/kindle/.gitignore
+++ b/kindle/.gitignore
@@ -3,4 +3,5 @@ kindle.json
 __pycache__
 venv
 config.py
-cache
\ No newline at end of file
+cache
+page
\ No newline at end of file
diff --git a/kindle/amz.py b/kindle/amz.py
new file mode 100644
index 0000000..0225ff3
--- /dev/null
+++ b/kindle/amz.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Look up book details via amazon.api.AmazonAPI, caching raw responses on disk."""
+import time as t
+import os
+import re
+from urllib.error import HTTPError
+
+from amazon.api import AmazonAPI
+
+import config
+
+cache_dir = 'cache/'
+
+# Cached responses older than this are ignored.  time.time() and
+# os.path.getmtime() are both in seconds, so no millisecond factor applies.
+CACHE_MAX_AGE_SECONDS = 100 * 24 * 60 * 60
+
+
+def _cache_path(cache_url):
+    """Return the cache file path for the ItemId embedded in cache_url."""
+    item_id = re.match('.*ItemId=(.*)&Operation', cache_url).group(1)
+    return cache_dir + item_id + '.xml'
+
+
+def write_query_to_db(cache_url, data):
+    """Write the response data for cache_url into the cache directory."""
+    os.makedirs(cache_dir, exist_ok=True)
+    with open(_cache_path(cache_url), 'wb') as f:
+        f.write(data)
+
+
+def read_query_from_db(cache_url):
+    """Return the cached data for cache_url, or None if missing or too old."""
+    path = _cache_path(cache_url)
+    if os.path.exists(path) and t.time() - os.path.getmtime(path) < CACHE_MAX_AGE_SECONDS:
+        with open(path, 'rb') as f:
+            return f.read()
+    return None
+
+
+amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG,
+                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
+
+
+def lookup(book):
+    """Copy product details for book.item_id onto book, retrying while HTTPError is raised."""
+    while True:
+        try:
+            product = amazon.lookup(ItemId=book.item_id)
+            break
+        except HTTPError as e:
+            # Report the failure, pause three seconds, then retry the same item.
+            print(e)
+            t.sleep(3)
+
+    book.author = product.author
+    book.pages = product.pages
+    book.publisher = product.publisher
+    book.brand = product.brand
+    book.asin = product.asin
+    book.binding = product.binding
+    book.edition = product.edition
+    book.editorial_review = product.editorial_review
+    book.isbn = product.isbn
+    book.large_image_url = product.large_image_url
+    book.region = product.region
+    # Guard both dates the same way: strftime on a missing date would raise.
+    if product.release_date:
+        book.release_date = product.release_date.strftime("%Y-%m-%d")
+    if product.publication_date:
+        book.publication_date = product.publication_date.strftime("%Y-%m-%d")
+    book.sales_rank = product.sales_rank
+    book.medium_image_url = product.medium_image_url
+    book.small_image_url = product.small_image_url
+    if product.languages:
+        book.languages = list(product.languages)
+    print('cached: ' + book.item_id + ' -> ' + book.title)
diff --git a/kindle/book.py b/kindle/book.py
index 12a33a7..c62ad6e 100644
--- a/kindle/book.py
+++ b/kindle/book.py
@@ -11,6 +11,34 @@ class Book:
     url = ''
     min_day = ''
 
+    item_id = None
+    pages = None
+    publisher = None
+    brand = None
+    asin = None
+    binding = None
+    edition = None
+    editorial_review = None
+    isbn = None
+    large_image_url = None
+    region = None
+    release_date = None
+    sales_rank = None
+    medium_image_url = None
+    publication_date = None
+    small_image_url = None
+    languages = None
+
     def json(self):
         return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)
 
+    def dump(self):
+        """Return the instance attributes as a dict with None values removed."""
+        return clean_dict(self.__dict__)
+
+
+def clean_dict(d):
+    """Recursively drop None-valued entries from d; non-dict values pass through."""
+    if not isinstance(d, dict):
+        return d
+    return {k: clean_dict(v) for k, v in d.items() if v is not None}
diff --git a/kindle/free_book.py b/kindle/free_book.py
new file mode 100644
index 0000000..a053329
--- /dev/null
+++ b/kindle/free_book.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Fetch the amazon.cn listing pages configured below and dump the books found to per-page JSON files."""
+import io
+import json
+import os
+import re
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+import amz
+from book import Book
+import config
+
+cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page='
+en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page='
+base_url = 'https://www.amazon.cn/gp/product/'
+page_dir = 'page/'
+
+
+def fetch_free_books(url, page):
+    """Fetch listing page `page` of `url` and return a dict with 'books', 'count' and 'page'."""
+    r = requests.get(url + str(page), headers=config.header)
+
+    bs = BeautifulSoup(r.text, 'lxml')
+    items = bs.find_all('li', attrs={'class': 's-result-item celwidget'})
+
+    kindle = {'books': []}
+
+    for item in items:
+        if isinstance(item, Tag):
+            book = Book()
+            book.title = item.find('h2').text
+            # book.item_id = item.find('span', attrs={'name': re.compile('.*')}).get('name')
+            book.item_id = item.get('data-asin')
+            book.url = base_url + book.item_id
+            book.average = 0
+            book.price = 0
+            book.min = 0
+            score = item.find('span', attrs={'class': 'a-icon-alt'})
+            if score:
+                # Leave the score unset when the label does not match the expected text.
+                matched = re.match('平均(.*) 星', score.text)
+                if matched:
+                    book.score = matched.group(1)
+
+            amz.lookup(book)
+
+            kindle['books'].append(book)
+
+    kindle['count'] = len(kindle['books'])
+    kindle['page'] = page
+    return kindle
+
+
+def _write_page(url, page, prefix):
+    """Fetch page `page` of `url` and write it to page_dir as <prefix><page>.json."""
+    kindle = fetch_free_books(url, page)
+    with io.open(page_dir + prefix + str(page) + '.json', 'w', encoding='utf-8') as f:
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
+
+
+def get_free_cn_books(page):
+    """Fetch page `page` of the cn_url listing and save it as JSON under page_dir."""
+    _write_page(cn_url, page, 'kindle_free_books_cn_')
+
+
+def get_free_en_books(page):
+    """Fetch page `page` of the en_url listing and save it as JSON under page_dir."""
+    _write_page(en_url, page, 'kindle_free_books_en_')
+
+
+def get_free_books():
+    """Fetch pages 1-399 of both listings, creating page_dir first if needed."""
+    os.makedirs(page_dir, exist_ok=True)
+
+    for page in range(1, 400):
+        get_free_cn_books(page)
+
+    for page in range(1, 400):
+        get_free_en_books(page)
+
+
+if __name__ == '__main__':
+    get_free_books()
diff --git a/kindle/kindle.py b/kindle/kindle.py
index 4b8bdb6..8f8f38e 100755
--- a/kindle/kindle.py
+++ b/kindle/kindle.py
@@ -1,46 +1,15 @@
 #!/usr/bin/env python3
 import io
 import json
-import os
 import re
-import time as t
-from urllib.error import HTTPError
 
-from amazon.api import AmazonAPI
 import requests
 from bs4 import Tag
 
+import amz
 import config
 from book import Book
 
-cache_dir = 'cache/'
-
-
-def write_query_to_db(cache_url, data):
-
-    if not os.path.exists(cache_dir):
-        os.mkdir(cache_dir)
-
-    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
-    f = open(file, 'wb')
-    f.write(data)
-
-
-def read_query_from_db(cache_url):
-    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
-    if os.path.exists(file) and os.path.getmtime(file) > t.time() - 20 * 60 * 60 * 1000:
-        f = open(file, 'rb')
-        return f.read()
-    return None
-
-
-amazon = AmazonAPI(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY, config.AWS_ASSOCIATE_TAG,
-                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
-
-user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
-
-header = {'User-Agent': user_agent}
-
 
 def fetch(url, headers, cookies):
     r = requests.get(url, headers=headers, cookies=cookies)
@@ -77,33 +46,12 @@ def fetch(url, headers, cookies):
             book.score = matches.group(4)
             book.min = matches.group(5)
 
-            while True:
-                try:
-                    product = amazon.lookup(ItemId=book.item_id)
+            amz.lookup(book)
 
-                    book.author = product.author
-                    book.pages = product.pages
-                    book.publisher = product.publisher
-                    book.brand = product.brand
-                    book.asin = product.asin
-                    book.binding = product.binding
-                    book.edition = product.edition
-                    book.editorial_reviews = product.editorial_reviews
-                    book.isbn = product.isbn
-                    book.large_image_url = product.large_image_url
-                    book.region = product.region
-                    book.release_date = product.release_date.strftime("%Y-%m-%d")
-                    book.sales_rank = product.sales_rank
-
-                    kindle['books'].append(book)
-                    print('cached: ' + book.item_id + ' -> ' + book.title)
-                    break
-                except HTTPError:
-                    t.sleep(2)
-                    pass
+            kindle['books'].append(book)
 
     with io.open('kindle.json', 'w', encoding='utf-8') as f:
-        f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True))
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
 
 if __name__ == '__main__':
-    fetch('http://t.bookdna.cn', header, {})
+    fetch('http://t.bookdna.cn', config.header, {})