move kindle to new repo
parent fd2cb0ed9f
commit 4817808e5e

@@ -1,7 +0,0 @@
kindle.json
.idea
__pycache__
venv
config.py
cache
page

@@ -1,27 +0,0 @@
# Kindle

## Configuration

Following `config.py.example`, edit the `config.py` file and fill in your `API key`, which you can obtain from [Amazon](https://console.aws.amazon.com/iam/home#security_credential).

```shell
AWS_ACCESS_KEY_ID = "xxx"
AWS_SECRET_ACCESS_KEY = "xxx"
AWS_ASSOCIATE_TAG = "xxx"
```

## Running

```shell
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt -I
python kindle.py
```

**crontab**

```shell
5 0 * * * /path/to/kindle/cron.sh >> /var/log/kindle.log 2>&1
```

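The scraper writes its snapshot to `kindle.json` (see `kindle.py` below). A minimal sketch of reading that output, assuming the layout `kindle.py` produces (`time`, `books`, and whatever `Book` fields survived `clean_dict()`):

```python
import json

# Load the snapshot written by kindle.py. Keys follow book.py; any field
# may be missing because clean_dict() drops None values before dumping.
with open('kindle.json', encoding='utf-8') as f:
    kindle = json.load(f)

print('updated at:', kindle['time'])
for book in kindle['books']:
    print(book.get('title'), book.get('price'), book.get('url'))
```
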
@@ -1,81 +0,0 @@
#!/usr/bin/env python3
import os
import re
import time as t
from urllib.error import HTTPError

from amazon.api import AmazonAPI

import config
from node import Node

cache_dir = 'cache/'


def write_query_to_db(cache_url, data):
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)

    # The ItemId embedded in the request URL becomes the cache filename.
    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
    with open(file, 'wb') as f:
        f.write(data)


def read_query_from_db(cache_url):
    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
    # time.time() and os.path.getmtime() are both in seconds; the original
    # `* 1000` factor made the intended 100-day expiry effectively infinite.
    if os.path.exists(file) and t.time() - os.path.getmtime(file) < 100 * 24 * 60 * 60:
        with open(file, 'rb') as f:
            return f.read()
    return None


amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG,
                   region='CN', MaxQPS=0.9,
                   CacheReader=read_query_from_db, CacheWriter=write_query_to_db)


def lookup(book):
    while True:
        try:
            product = amazon.lookup(ItemId=book.item_id)

            book.author = product.author
            book.pages = product.pages
            book.publisher = product.publisher
            book.brand = product.brand
            book.asin = product.asin
            book.binding = product.binding
            book.edition = product.edition
            book.editorial_review = product.editorial_review
            book.isbn = product.isbn
            book.large_image_url = product.large_image_url
            book.region = product.region
            book.release_date = product.release_date.strftime("%Y-%m-%d")
            if product.publication_date:
                book.publication_date = product.publication_date.strftime("%Y-%m-%d")
            book.sales_rank = product.sales_rank
            book.medium_image_url = product.medium_image_url
            book.small_image_url = product.small_image_url
            if product.languages:
                book.languages = list(product.languages)

            # Walk each browse node's ancestor chain into a linked list of Nodes.
            book.nodes = []
            for browse_node in product.browse_nodes:
                node = Node()
                book.nodes.append(node)
                while True:
                    node.id = browse_node.id
                    node.name = str(browse_node.name)
                    if not browse_node.is_category_root:
                        node.node = Node()
                        node = node.node
                        browse_node = browse_node.ancestor
                    else:
                        node.is_root = True
                        break

            print('cached: ' + book.item_id + ' -> ' + book.title)
            break
        except HTTPError as e:
            # Back off briefly and retry on throttling or transient errors.
            print(e)
            t.sleep(3)

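For reference, a quick illustration of the cache-key scheme above (the URL is hypothetical, purely for illustration): the `ItemId` query parameter becomes the XML filename under `cache/`.

```python
import re

# Hypothetical Product Advertising API request URL, for illustration only.
url = 'http://webservices.amazon.cn/onca/xml?ItemId=B00EXAMPLE&Operation=ItemLookup'
item_id = re.match('.*ItemId=(.*)&Operation', url).group(1)
print(item_id)  # -> B00EXAMPLE; write_query_to_db() stores cache/B00EXAMPLE.xml
```
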
@@ -1,43 +0,0 @@
import json


class Book:
    title = ''
    average = 0
    price = 0
    author = ''
    min = 0
    score = 0
    url = ''
    min_day = ''

    item_id = None
    pages = None
    publisher = None
    brand = None
    asin = None
    binding = None
    edition = None
    editorial_review = None
    isbn = None
    large_image_url = None
    region = None
    release_date = None
    sales_rank = None
    medium_image_url = None
    publication_date = None
    small_image_url = None
    languages = None
    nodes = None

    def json(self):
        return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)

    def dump(self):
        # Only attributes assigned on the instance appear in __dict__.
        return clean_dict(self.__dict__)


def clean_dict(d):
    # Recursively drop None values so the serialized JSON stays compact.
    if not isinstance(d, dict):
        return d
    return dict((k, clean_dict(v)) for k, v in d.items() if v is not None)

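A hypothetical illustration of how `dump()` and `clean_dict()` interact (not part of the original file): class-level defaults never reach `__dict__`, and `None` values are stripped recursively, so only fields actually set on the instance are serialized.

```python
from book import Book

b = Book()
b.title = 'Example'         # hypothetical values for illustration
b.item_id = 'B00EXAMPLE'
# Class defaults are absent from b.__dict__ and None values are dropped,
# so only the two assigned fields survive.
print(b.dump())             # -> {'title': 'Example', 'item_id': 'B00EXAMPLE'}
```
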
@@ -1,6 +0,0 @@
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36'
header = {'User-Agent': user_agent}

KEY_ID = "xxx"
SECRET_KEY = "xxx"
TAG = "xxx"

@@ -1,11 +0,0 @@
#!/bin/bash

date

# Resolve the script's own directory; don't reuse the shell's special PWD.
DIR="$(dirname "$0")"

echo "$DIR"

cd "$DIR" || exit 1

PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python kindle.py

@@ -1,11 +0,0 @@
#!/bin/bash

date

# Resolve the script's own directory; don't reuse the shell's special PWD.
DIR="$(dirname "$0")"

echo "$DIR"

cd "$DIR" || exit 1

PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python free_book.py

@@ -1,74 +0,0 @@
#!/usr/bin/env python3
import io
import json
import os
import re

import requests
from bs4 import BeautifulSoup, Tag

import amz
import config
from book import Book

cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page='
en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page='
base_url = 'https://www.amazon.cn/gp/product/'
page_dir = 'page/'


def fetch_free_books(url, page):
    r = requests.get(url + str(page), headers=config.header)
    bs = BeautifulSoup(r.text, 'lxml')
    items = bs.find_all('li', attrs={'class': 's-result-item celwidget'})

    kindle = {'books': []}

    for item in items:
        if isinstance(item, Tag):
            book = Book()
            book.title = item.find('h2').text
            # book.item_id = item.find('span', attrs={'name': re.compile('.*')}).get('name')
            book.item_id = item.get('data-asin')
            book.url = base_url + book.item_id
            book.average = 0
            book.price = 0
            book.min = 0
            score = item.find('span', attrs={'class': 'a-icon-alt'})
            if score:
                # The rating text reads "平均 x.x 星" ("average x.x stars").
                book.score = re.match('平均(.*) 星', score.text).group(1)

            amz.lookup(book)

            kindle['books'].append(book)

    kindle['count'] = len(kindle['books'])
    kindle['page'] = page
    return kindle


def get_free_cn_books(page):
    kindle = fetch_free_books(cn_url, page)
    with io.open(page_dir + 'kindle_free_books_cn_' + str(page) + '.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))


def get_free_en_books(page):
    kindle = fetch_free_books(en_url, page)
    with io.open(page_dir + 'kindle_free_books_en_' + str(page) + '.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))


def get_free_books():
    if not os.path.exists(page_dir):
        os.mkdir(page_dir)

    for page in range(1, 400):
        get_free_cn_books(page)

    for page in range(1, 400):
        get_free_en_books(page)


get_free_books()

@@ -1,57 +0,0 @@
#!/usr/bin/env python3
import io
import json
import re

import requests
from bs4 import BeautifulSoup, Tag

import amz
import config
from book import Book


def fetch(url, headers, cookies):
    r = requests.get(url, headers=headers, cookies=cookies)
    bs = BeautifulSoup(r.text, 'lxml')

    # The banner reads "数据更新于:<timestamp>" ("data updated at ...").
    time = re.match('数据更新于:(.*)', bs.find('span', style='color:#FFF9A8').text).group(1)

    kindle = {'time': time, 'books': []}

    book_items = bs.find_all('div', style='margin-bottom: 0.9em;')

    for book_item in book_items:
        book = Book()

        if isinstance(book_item, Tag):
            a = book_item.find('a')
            # "最近在<date>达到最低价" -> "hit its lowest price on <date>".
            min_day = book_item.find('span', title=re.compile('最近在')).get('title')
            book.min_day = re.match('最近在(.*)达到最低价', min_day).group(1)

            if isinstance(a, Tag):
                book.url = 'https' + re.match('http(.*)/ref', a.get('href')).group(1)
                book.item_id = re.match('.*product/(.*)/ref', a.get('href')).group(1)
                book.title = a.get('title')

            # Parse "历史均价 (average price) / 现价 (current price) / 作者
            # (author) / 评分 (score) / 历史最低价 (lowest price)" out of the
            # item's flattened text.
            matches = re.match('.*历史均价:¥(.*),现价:¥(.*)作者:(.*),评分:(.*),历史最低价:¥(.*)', book_item.text)

            book.average = matches.group(1)
            book.price = matches.group(2)
            book.author = matches.group(3)
            book.score = matches.group(4)
            book.min = matches.group(5)

            amz.lookup(book)

            kindle['books'].append(book)

    with io.open('kindle.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))


if __name__ == '__main__':
    fetch('http://t.bookdna.cn', config.header, {})

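A quick check of the blurb-parsing regex above, run against a hypothetical item text mirroring the layout the page is assumed to use:

```python
import re

# Hypothetical flattened item text, for illustration only.
text = '某书 历史均价:¥20.00,现价:¥0.00作者:某人,评分:4.5,历史最低价:¥0.00'
m = re.match('.*历史均价:¥(.*),现价:¥(.*)作者:(.*),评分:(.*),历史最低价:¥(.*)', text)
print(m.groups())  # -> ('20.00', '0.00', '某人', '4.5', '0.00')
```
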
@@ -1,17 +0,0 @@
class Node:
    node = None  # ancestor
    id = None
    is_root = False
    name = None

    def dump(self):
        return clean_dict(self.__dict__)


def clean_dict(d):
    # Recursively drop None values, mirroring the helper in book.py.
    if not isinstance(d, dict):
        return d
    return dict((k, clean_dict(v)) for k, v in d.items() if v is not None)

@@ -1,7 +0,0 @@
beautifulsoup4==4.5.1
bottlenose==1.0.1
lxml==3.5.0
python-amazon-simple-product-api==2.1.0
python-dateutil==2.5.3
requests==2.9.1
six==1.10.0