diff --git a/kindle/.gitignore b/kindle/.gitignore
index eeffe73..52158a9 100644
--- a/kindle/.gitignore
+++ b/kindle/.gitignore
@@ -1,3 +1,6 @@
 kindle.json
 .idea
-__pycache__
\ No newline at end of file
+__pycache__
+venv
+config.py
+cache
\ No newline at end of file
diff --git a/kindle/README.md b/kindle/README.md
index 3536bc6..a35a156 100644
--- a/kindle/README.md
+++ b/kindle/README.md
@@ -1,8 +1,18 @@
 # Kindle
 
+## 配置
+
+参考 `config.py.example`,修改 `config.py` 文件,填写 `API key`,请在 [Amazon](https://console.aws.amazon.com/iam/home#security_credential) 获取。
+
+```python
+AWS_ACCESS_KEY_ID = "xxx"
+AWS_SECRET_ACCESS_KEY = "xxx"
+AWS_ASSOCIATE_TAG = "xxx"
+```
+
 ## 运行
 
-```
+```shell
 virtualenv -p python3 venv
 source venv/bin/activate
 pip install -r requirements.txt -I
@@ -11,6 +21,6 @@ python kindle.py
 
 **crontab**
 
-```
+```shell
 5 0 * * * /path/to/kindle/cron.sh >> /var/log/kindle.log 2>&1
 ```
diff --git a/kindle/config.py.example b/kindle/config.py.example
new file mode 100644
index 0000000..7e0d1cf
--- /dev/null
+++ b/kindle/config.py.example
@@ -0,0 +1,3 @@
+AWS_ACCESS_KEY_ID = "xxx"
+AWS_SECRET_ACCESS_KEY = "xxx"
+AWS_ASSOCIATE_TAG = "xxx"
diff --git a/kindle/kindle.py b/kindle/kindle.py
index f99ab6d..4b8bdb6 100755
--- a/kindle/kindle.py
+++ b/kindle/kindle.py
@@ -1,13 +1,61 @@
 #!/usr/bin/env python3
 import io
 import json
+import os
 import re
+import time as t
+from urllib.error import HTTPError
 
+from amazon.api import AmazonAPI
 import requests
 from bs4 import Tag
 
+import config
 from book import Book
 
+cache_dir = 'cache/'
+# Cached responses stay valid for 20 hours.  time.time() and
+# os.path.getmtime() both count seconds, so no millisecond factor here.
+cache_ttl = 20 * 60 * 60
+# Upper bound on lookup attempts per book when the API answers with an HTTP error.
+lookup_retries = 5
+
+
+def cache_path(cache_url):
+    """Return the cache file for a query URL, or None if it carries no ItemId."""
+    matches = re.match('.*ItemId=(.*)&Operation', cache_url)
+    if matches is None:
+        return None
+    return cache_dir + matches.group(1) + '.xml'
+
+
+def write_query_to_db(cache_url, data):
+    """Save the response bytes for cache_url into the cache directory."""
+    path = cache_path(cache_url)
+    if path is None:
+        return
+
+    os.makedirs(cache_dir, exist_ok=True)
+    # The context manager flushes and closes the file before returning.
+    with open(path, 'wb') as f:
+        f.write(data)
+
+
+def read_query_from_db(cache_url):
+    """Return the cached bytes for cache_url, or None when missing or stale."""
+    path = cache_path(cache_url)
+    if path is None:
+        return None
+
+    if os.path.exists(path) and os.path.getmtime(path) > t.time() - cache_ttl:
+        with open(path, 'rb') as f:
+            return f.read()
+    return None
+
+
+amazon = AmazonAPI(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY, config.AWS_ASSOCIATE_TAG,
+                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
+
 user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
 header = {'User-Agent': user_agent}
 
@@ -37,6 +85,7 @@ def fetch(url, headers, cookies):
 
         if isinstance(a, Tag):
             book.url = 'https' + re.match('http(.*)/ref', a.get('href')).group(1)
+            book.item_id = re.match('.*product/(.*)/ref', a.get('href')).group(1)
             book.title = a.get('title')
 
         matches = re.match('.*历史均价:¥(.*),现价:¥(.*)作者:(.*),评分:(.*),历史最低价:¥(.*)', book_item.text)
@@ -47,7 +96,38 @@ def fetch(url, headers, cookies):
             book.score = matches.group(4)
             book.min = matches.group(5)
 
+        # Enrich the scraped entry through the Product Advertising API.  The
+        # retry count is bounded so a persistent HTTP error cannot stall the run.
+        for _ in range(lookup_retries):
+            try:
+                product = amazon.lookup(ItemId=book.item_id)
+            except HTTPError:
+                # Back off briefly before the next attempt.
+                t.sleep(2)
+                continue
+
+            book.author = product.author
+            book.pages = product.pages
+            book.publisher = product.publisher
+            book.brand = product.brand
+            book.asin = product.asin
+            book.binding = product.binding
+            book.edition = product.edition
+            book.editorial_reviews = product.editorial_reviews
+            book.isbn = product.isbn
+            book.large_image_url = product.large_image_url
+            book.region = product.region
+            # release_date may be None when the API response carries no date.
+            release_date = product.release_date
+            book.release_date = release_date.strftime("%Y-%m-%d") if release_date else None
+            book.sales_rank = product.sales_rank
+            print('cached: ' + book.item_id + ' -> ' + book.title)
+            break
+        else:
+            # Keep the scraped fields even though the API details are missing.
+            print('lookup failed: ' + book.item_id + ' -> ' + book.title)
+
         kindle['books'].append(book)
 
     with io.open('kindle.json', 'w', encoding='utf-8') as f:
         f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True))
diff --git a/kindle/requirements.txt b/kindle/requirements.txt
index 1c47764..1941487 100644
--- a/kindle/requirements.txt
+++ b/kindle/requirements.txt
@@ -1,3 +1,4 @@
 lxml == 3.5.0
 requests == 2.9.1
+AmazonAPIWrapper == 0.0.11
 beautifulsoup4 == 4.5.1