diff --git a/kindle/.gitignore b/kindle/.gitignore deleted file mode 100644 index 27c96ab..0000000 --- a/kindle/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -kindle.json -.idea -__pycache__ -venv -config.py -cache -page \ No newline at end of file diff --git a/kindle/README.md b/kindle/README.md deleted file mode 100644 index a35a156..0000000 --- a/kindle/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Kindle - -## 配置 - -参考 `config.py.example` ,修改 `config.py` 文件,填写 `API key`, 请在 [Amazon](https://console.aws.amazon.com/iam/home#security_credential -) 获取。 - -```shell -AWS_ACCESS_KEY_ID = "xxx" -AWS_SECRET_ACCESS_KEY = "xxx" -AWS_ASSOCIATE_TAG = "xxx" -``` - -## 运行 - -```shell -virtualenv -p python3 venv -source venv/bin/activate -pip install -r requirements.txt -I -python kindle.py -``` - -**crontab** - -```shell -5 0 * * * /path/to/kindle/cron.sh >> /var/log/kindle.log 2>&1 -``` diff --git a/kindle/amz.py b/kindle/amz.py deleted file mode 100644 index b5f91c6..0000000 --- a/kindle/amz.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 -import time as t -import os -import re -from urllib.error import HTTPError - -from amazon.api import AmazonAPI - -import config -from node import Node - -cache_dir = 'cache/' - - -def write_query_to_db(cache_url, data): - if not os.path.exists(cache_dir): - os.mkdir(cache_dir) - - file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml' - f = open(file, 'wb') - f.write(data) - - -def read_query_from_db(cache_url): - file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml' - if os.path.exists(file) and t.time() - os.path.getmtime(file) < 100 * 24 * 60 * 60 * 1000: - f = open(file, 'rb') - return f.read() - return None - - -amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG, - region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db) - - -def lookup(book): - while True: - try: - product = amazon.lookup(ItemId=book.item_id) - - book.author = product.author - book.pages = product.pages - book.publisher = product.publisher - book.brand = product.brand - book.asin = product.asin - book.binding = product.binding - book.edition = product.edition - book.editorial_review = product.editorial_review - book.isbn = product.isbn - book.large_image_url = product.large_image_url - book.region = product.region - book.release_date = product.release_date.strftime("%Y-%m-%d") - if product.publication_date: - book.publication_date = product.publication_date.strftime("%Y-%m-%d") - book.sales_rank = product.sales_rank - book.medium_image_url = product.medium_image_url - book.small_image_url = product.small_image_url - if product.languages: - book.languages = list(product.languages) - - book.nodes = [] - for browse_node in product.browse_nodes: - node = Node() - book.nodes.append(node) - while True: - node.id = browse_node.id - node.name = str(browse_node.name) - if not browse_node.is_category_root: - node.node = Node() - node = node.node - browse_node = browse_node.ancestor - else: - node.is_root = True - break - - print('cached: ' + book.item_id + ' -> ' + book.title) - break - except HTTPError as e: - print(e) - t.sleep(3) - pass diff --git a/kindle/book.py b/kindle/book.py deleted file mode 100644 index 61b9171..0000000 --- a/kindle/book.py +++ /dev/null @@ -1,43 +0,0 @@ -import json - - -class Book: - title = '' - average = 0 - price = 0 - author = '' - min = 0 - score = 0 - url = '' - min_day = '' - - item_id = None - pages = None - publisher = None - brand = None - asin = None - binding = None - edition = None - editorial_review = None - isbn = None - large_image_url = None - region = None - release_date = None - sales_rank = None - medium_image_url = None - publication_date = None - small_image_url = None - languages = None - nodes = None - - def json(self): - return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True) - - def dump(self): - return clean_dict(self.__dict__) - - -def clean_dict(d): - if not isinstance(d, dict): - return d - return dict((k, clean_dict(v)) for k, v in d.items() if v is not None) diff --git a/kindle/config.py.example b/kindle/config.py.example deleted file mode 100644 index e7a0b30..0000000 --- a/kindle/config.py.example +++ /dev/null @@ -1,6 +0,0 @@ -user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36' -header = {'User-Agent': user_agent} - -KEY_ID = "xxx" -SECRET_KEY = "xxx" -TAG = "xxx" diff --git a/kindle/cron.sh b/kindle/cron.sh deleted file mode 100755 index dc91833..0000000 --- a/kindle/cron.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo $(date) - -PWD="$(dirname $0)" - -echo "$PWD" - -cd "$PWD" || exit 1 - -PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python kindle.py diff --git a/kindle/free.sh b/kindle/free.sh deleted file mode 100755 index ca23073..0000000 --- a/kindle/free.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo $(date) - -PWD="$(dirname $0)" - -echo "$PWD" - -cd "$PWD" || exit 1 - -PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python free_book.py diff --git a/kindle/free_book.py b/kindle/free_book.py deleted file mode 100644 index a053329..0000000 --- a/kindle/free_book.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -import io -import json -import os -import re - -import requests - -from book import Book -import config - -cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page=' -en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page=' -base_url = 'https://www.amazon.cn/gp/product/' -page_dir = 'page/' - - -def fetch_free_books(url, page): - r = requests.get(url + str(page), headers=config.header) - from bs4 import BeautifulSoup, Tag - import lxml - - bs = BeautifulSoup(r.text, lxml.__name__) - items = bs.find_all('li', attrs={'class': 's-result-item celwidget'}) - - kindle = {'books': []} - - for item in items: - if isinstance(item, Tag): - book = Book() - book.title = item.find('h2').text - # book.item_id = item.find('span', attrs={'name': re.compile('.*')}).get('name') - book.item_id = item.get('data-asin') - book.url = base_url + book.item_id - book.average = 0 - book.price = 0 - book.min = 0 - score = item.find('span', attrs={'class': 'a-icon-alt'}) - if score: - book.score = re.match('平均(.*) 星', score.text).group(1) - - import amz - amz.lookup(book) - - kindle['books'].append(book) - - kindle['count'] = len(kindle['books']) - kindle['page'] = page - return kindle - - -def get_free_cn_books(page): - kindle = fetch_free_books(cn_url, page) - with io.open(page_dir + 'kindle_free_books_cn_' + str(page) + '.json', 'w', encoding='utf-8') as f: - f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True)) - - -def get_free_en_books(page): - kindle = fetch_free_books(en_url, page) - with io.open(page_dir + 'kindle_free_books_en_' + str(page) + '.json', 'w', encoding='utf-8') as f: - f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True)) - - -def get_free_books(): - if not os.path.exists(page_dir): - os.mkdir(page_dir) - - for page in range(1, 400): - get_free_cn_books(page) - - for page in range(1, 400): - get_free_en_books(page) - -get_free_books() diff --git a/kindle/kindle.py b/kindle/kindle.py deleted file mode 100755 index 8f8f38e..0000000 --- a/kindle/kindle.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -import io -import json -import re - -import requests -from bs4 import Tag - -import config -from book import Book - - -def fetch(url, headers, cookies): - r = requests.get(url, headers=headers, cookies=cookies) - from bs4 import BeautifulSoup - import lxml - - bs = BeautifulSoup(r.text, lxml.__name__) - - time = re.match('数据更新于:(.*)', bs.find('span', style='color:#FFF9A8').text).group(1) - - kindle = {'time': time, 'books': []} - - book_items = bs.find_all('div', style='margin-bottom: 0.9em;') - - for book_item in book_items: - - book = Book() - - if isinstance(book_item, Tag): - a = book_item.find('a') - min_day = book_item.find('span', title=re.compile('最近在')).get('title') - book.min_day = re.match('最近在(.*)达到最低价', min_day).group(1) - - if isinstance(a, Tag): - book.url = 'https' + re.match('http(.*)/ref', a.get('href')).group(1) - book.item_id = re.match('.*product/(.*)/ref', a.get('href')).group(1) - book.title = a.get('title') - - matches = re.match('.*历史均价:¥(.*),现价:¥(.*)作者:(.*),评分:(.*),历史最低价:¥(.*)', book_item.text) - - book.average = matches.group(1) - book.price = matches.group(2) - book.author = matches.group(3) - book.score = matches.group(4) - book.min = matches.group(5) - - import amz - amz.lookup(book) - - kindle['books'].append(book) - - with io.open('kindle.json', 'w', encoding='utf-8') as f: - f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True)) - -if __name__ == '__main__': - fetch('http://t.bookdna.cn', config.header, {}) diff --git a/kindle/node.py b/kindle/node.py deleted file mode 100644 index 756a0c1..0000000 --- a/kindle/node.py +++ /dev/null @@ -1,17 +0,0 @@ - -class Node: - node = None # ancestor - id = None - is_root = False - name = None - - def dump(self): - return clean_dict(self.__dict__) - - -def clean_dict(d): - if not isinstance(d, dict): - return d - return dict((k, clean_dict(v)) for k, v in d.items() if v is not None) - - diff --git a/kindle/requirements.txt b/kindle/requirements.txt deleted file mode 100644 index 95ff000..0000000 --- a/kindle/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -beautifulsoup4==4.5.1 -bottlenose==1.0.1 -lxml==3.5.0 -python-amazon-simple-product-api==2.1.0 -python-dateutil==2.5.3 -requests==2.9.1 -six==1.10.0