scripts-github-mirror/kindle/kindle.py

#!/usr/bin/env python3
import io
import json
import os
import re
import time as t
from urllib.error import HTTPError

from amazon.api import AmazonAPI
import requests
from bs4 import Tag

import config
from book import Book

cache_dir = 'cache/'


def write_query_to_db(cache_url, data):

    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)

    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
    f = open(file, 'wb')
    f.write(data)


def read_query_from_db(cache_url):
    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
    if os.path.exists(file) and os.path.getmtime(file) > t.time() - 20 * 60 * 60 * 1000:
        f = open(file, 'rb')
        return f.read()
    return None


amazon = AmazonAPI(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY, config.AWS_ASSOCIATE_TAG,
                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)

user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'

header = {'User-Agent': user_agent}


def fetch(url, headers, cookies):
    r = requests.get(url, headers=headers, cookies=cookies)
    from bs4 import BeautifulSoup
    import lxml

    bs = BeautifulSoup(r.text, lxml.__name__)

    time = re.match('数据更新于：(.*)', bs.find('span', style='color:#FFF9A8').text).group(1)

    kindle = {'time': time, 'books': []}

    book_items = bs.find_all('div', style='margin-bottom: 0.9em;')

    for book_item in book_items:

        book = Book()

        if isinstance(book_item, Tag):
            a = book_item.find('a')
            min_day = book_item.find('span', title=re.compile('最近在')).get('title')
            book.min_day = re.match('最近在(.*)达到最低价', min_day).group(1)

            if isinstance(a, Tag):
                book.url = 'https' + re.match('http(.*)/ref', a.get('href')).group(1)
                book.item_id = re.match('.*product/(.*)/ref', a.get('href')).group(1)
                book.title = a.get('title')

            matches = re.match('.*历史均价：￥(.*)，现价：￥(.*)作者：(.*)，评分：(.*)，历史最低价：￥(.*)', book_item.text)

            book.average = matches.group(1)
            book.price = matches.group(2)
            book.author = matches.group(3)
            book.score = matches.group(4)
            book.min = matches.group(5)

            while True:
                try:
                    product = amazon.lookup(ItemId=book.item_id)

                    book.author = product.author
                    book.pages = product.pages
                    book.publisher = product.publisher
                    book.brand = product.brand
                    book.asin = product.asin
                    book.binding = product.binding
                    book.edition = product.edition
                    book.editorial_reviews = product.editorial_reviews
                    book.isbn = product.isbn
                    book.large_image_url = product.large_image_url
                    book.region = product.region
                    book.release_date = product.release_date.strftime("%Y-%m-%d")
                    book.sales_rank = product.sales_rank

                    kindle['books'].append(book)
                    print('cached: ' + book.item_id + ' -> ' + book.title)
                    break
                except HTTPError:
                    t.sleep(2)
                    pass

    with io.open('kindle.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True))

if __name__ == '__main__':
    fetch('http://t.bookdna.cn', header, {})