add freebooks

This commit is contained in:
tianyu 2016-09-28 00:18:48 +08:00
parent 4ee49d6d4c
commit 3c478216bd
5 changed files with 171 additions and 58 deletions

3
kindle/.gitignore vendored
View File

@@ -3,4 +3,5 @@ kindle.json
__pycache__ __pycache__
venv venv
config.py config.py
cache cache
page

64
kindle/amz.py Normal file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
import time as t
import os
import re
from urllib.error import HTTPError
from amazon.api import AmazonAPI
import config
cache_dir = 'cache/'


def _cache_path(cache_url):
    """Map a product-API request URL to its cache file: cache/<ItemId>.xml."""
    return cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'


def write_query_to_db(cache_url, data):
    """Persist the raw XML API response (*data*, bytes) for *cache_url*.

    Used as the AmazonAPI CacheWriter callback so later runs can skip the
    network call.
    """
    # makedirs(exist_ok=True) is race-free, unlike the original exists()+mkdir()
    os.makedirs(cache_dir, exist_ok=True)
    # with-statement closes the handle even on error (original leaked it)
    with open(_cache_path(cache_url), 'wb') as f:
        f.write(data)


def read_query_from_db(cache_url):
    """Return cached XML bytes for *cache_url*, or None if absent or expired.

    Entries expire after 100 days. Both time.time() and os.path.getmtime()
    are in seconds; the original multiplied the threshold by 1000 as though
    they were milliseconds, so the cache effectively never expired.
    """
    file = _cache_path(cache_url)
    if os.path.exists(file) and t.time() - os.path.getmtime(file) < 100 * 24 * 60 * 60:
        with open(file, 'rb') as f:
            return f.read()
    return None
# Product Advertising API client for the Chinese (CN) marketplace, throttled
# just under one request per second and backed by the on-disk XML query cache.
amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG,
region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
def lookup(book):
    """Fill *book* in place with product metadata fetched by item_id.

    Retries forever on HTTPError (the API throttles aggressively), sleeping
    3 seconds between attempts. Returns None; mutates *book*.
    """
    while True:
        try:
            product = amazon.lookup(ItemId=book.item_id)
            book.author = product.author
            book.pages = product.pages
            book.publisher = product.publisher
            book.brand = product.brand
            book.asin = product.asin
            book.binding = product.binding
            book.edition = product.edition
            book.editorial_review = product.editorial_review
            book.isbn = product.isbn
            book.large_image_url = product.large_image_url
            book.region = product.region
            # Dates can be missing; guard both. The original guarded only
            # publication_date and crashed (.strftime on None) when
            # release_date was absent.
            if product.release_date:
                book.release_date = product.release_date.strftime("%Y-%m-%d")
            if product.publication_date:
                book.publication_date = product.publication_date.strftime("%Y-%m-%d")
            book.sales_rank = product.sales_rank
            book.medium_image_url = product.medium_image_url
            book.small_image_url = product.small_image_url
            if product.languages:
                book.languages = list(product.languages)
            print('cached: ' + book.item_id + ' -> ' + book.title)
            break
        except HTTPError as e:
            print(e)
            t.sleep(3)

View File

@@ -11,6 +11,32 @@ class Book:
url = '' url = ''
min_day = '' min_day = ''
# Amazon product metadata; all None until filled in by amz.lookup.
item_id = None
pages = None
publisher = None
brand = None
asin = None
binding = None
edition = None
editorial_review = None
isbn = None
large_image_url = None
region = None
# release_date / publication_date are stored as "YYYY-MM-DD" strings
release_date = None
sales_rank = None
medium_image_url = None
publication_date = None
small_image_url = None
# list of language descriptors from the product feed, if any
languages = None
def json(self): def json(self):
return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True) return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)
def dump(self):
    """Return this book as a plain dict with every None-valued entry removed."""
    return clean_dict(vars(self))
def clean_dict(d):
    """Recursively drop None-valued keys from dict *d*.

    Non-dict values (including those nested inside) pass through unchanged.
    """
    if isinstance(d, dict):
        return {k: clean_dict(v) for k, v in d.items() if v is not None}
    return d

74
kindle/free_book.py Normal file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
import io
import json
import os
import re
import requests
from book import Book
import config
cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page='
en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page='
base_url = 'https://www.amazon.cn/gp/product/'
page_dir = 'page/'
def fetch_free_books(url, page):
    """Scrape one result page of free Kindle books into a serializable dict.

    Returns {'books': [Book, ...], 'count': int, 'page': page}; each Book is
    enriched with product metadata via amz.lookup.
    """
    # Imports stay function-local (matches the file's style; importing amz
    # constructs the API client) but are hoisted out of the per-item loop —
    # the original re-imported amz for every scraped book.
    import amz
    from bs4 import BeautifulSoup, Tag
    import lxml

    r = requests.get(url + str(page), headers=config.header)
    bs = BeautifulSoup(r.text, lxml.__name__)
    items = bs.find_all('li', attrs={'class': 's-result-item celwidget'})
    kindle = {'books': []}
    for item in items:
        if isinstance(item, Tag):
            book = Book()
            book.title = item.find('h2').text
            book.item_id = item.get('data-asin')
            book.url = base_url + book.item_id
            book.average = 0
            book.price = 0
            book.min = 0
            score = item.find('span', attrs={'class': 'a-icon-alt'})
            if score:
                # e.g. "平均4.2 星" -> "4.2"; skip if the text format differs
                # (the original crashed with AttributeError on .group of None)
                m = re.match('平均(.*) 星', score.text)
                if m:
                    book.score = m.group(1)
            amz.lookup(book)
            kindle['books'].append(book)
    kindle['count'] = len(kindle['books'])
    kindle['page'] = page
    return kindle
def get_free_cn_books(page):
    """Scrape page *page* of free Chinese Kindle books and write it as JSON."""
    result = fetch_free_books(cn_url, page)
    out_path = page_dir + 'kindle_free_books_cn_' + str(page) + '.json'
    with io.open(out_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
def get_free_en_books(page):
    """Scrape page *page* of free English Kindle books and write it as JSON."""
    result = fetch_free_books(en_url, page)
    out_path = page_dir + 'kindle_free_books_en_' + str(page) + '.json'
    with io.open(out_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
def get_free_books(max_page=400):
    """Crawl the free-book listings, Chinese then English, pages 1..max_page-1.

    Each page is written as a JSON file under page_dir (created if missing).
    max_page defaults to the original hard-coded 400 for backward compatibility.
    """
    # makedirs(exist_ok=True) is race-free, unlike exists()+mkdir()
    os.makedirs(page_dir, exist_ok=True)
    for page in range(1, max_page):
        get_free_cn_books(page)
    for page in range(1, max_page):
        get_free_en_books(page)
if __name__ == '__main__':
    # Guard the crawl so merely importing this module doesn't trigger
    # hundreds of HTTP fetches as a side effect.
    get_free_books()

View File

@@ -1,46 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import io import io
import json import json
import os
import re import re
import time as t
from urllib.error import HTTPError
from amazon.api import AmazonAPI
import requests import requests
from bs4 import Tag from bs4 import Tag
import config import config
from book import Book from book import Book
cache_dir = 'cache/'
def write_query_to_db(cache_url, data):
if not os.path.exists(cache_dir):
os.mkdir(cache_dir)
file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
f = open(file, 'wb')
f.write(data)
def read_query_from_db(cache_url):
file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
if os.path.exists(file) and os.path.getmtime(file) > t.time() - 20 * 60 * 60 * 1000:
f = open(file, 'rb')
return f.read()
return None
amazon = AmazonAPI(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY, config.AWS_ASSOCIATE_TAG,
region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
header = {'User-Agent': user_agent}
def fetch(url, headers, cookies): def fetch(url, headers, cookies):
r = requests.get(url, headers=headers, cookies=cookies) r = requests.get(url, headers=headers, cookies=cookies)
@@ -77,33 +45,13 @@ def fetch(url, headers, cookies):
book.score = matches.group(4) book.score = matches.group(4)
book.min = matches.group(5) book.min = matches.group(5)
while True: import amz
try: amz.lookup(book)
product = amazon.lookup(ItemId=book.item_id)
book.author = product.author kindle['books'].append(book)
book.pages = product.pages
book.publisher = product.publisher
book.brand = product.brand
book.asin = product.asin
book.binding = product.binding
book.edition = product.edition
book.editorial_reviews = product.editorial_reviews
book.isbn = product.isbn
book.large_image_url = product.large_image_url
book.region = product.region
book.release_date = product.release_date.strftime("%Y-%m-%d")
book.sales_rank = product.sales_rank
kindle['books'].append(book)
print('cached: ' + book.item_id + ' -> ' + book.title)
break
except HTTPError:
t.sleep(2)
pass
with io.open('kindle.json', 'w', encoding='utf-8') as f: with io.open('kindle.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)) f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
if __name__ == '__main__': if __name__ == '__main__':
fetch('http://t.bookdna.cn', header, {}) fetch('http://t.bookdna.cn', config.header, {})