add freebooks

2025-11-12 06:56:09 +08:00 · 2016-09-28 00:18:48 +08:00
parent 4ee49d6d4c
commit 3c478216bd
5 changed files with 171 additions and 58 deletions
--- a/kindle/.gitignore
+++ b/kindle/.gitignore
@@ -4,3 +4,4 @@ __pycache__
 venv
 config.py
 cache
+page
--- a/kindle/amz.py
+++ b/kindle/amz.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+import time as t
+import os
+import re
+from urllib.error import HTTPError
+
+from amazon.api import AmazonAPI
+
+import config
+
+cache_dir = 'cache/'  # directory holding one <ItemId>.xml file per cached lookup response
+
+
+def write_query_to_db(cache_url, data):
+    """Save `data` (bytes) as cache/<ItemId>.xml, taking ItemId from the query string of `cache_url`."""
+    os.makedirs(cache_dir, exist_ok=True)  # replaces the racy exists()/mkdir() pair
+
+    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
+    with open(file, 'wb') as f:  # closes (and flushes) the handle the original left open
+        f.write(data)
+
+
+def read_query_from_db(cache_url):
+    """Return the cached bytes for `cache_url`; returns None if not cached or older than 100 days."""
+    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
+    if os.path.exists(file) and t.time() - os.path.getmtime(file) < 100 * 24 * 60 * 60:  # both clocks are in seconds
+        with open(file, 'rb') as f:  # context manager closes the handle the original leaked
+            return f.read()
+
+
+amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG,  # credentials are read from config.py
+                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)  # responses are cached on disk through the two hooks above
+
+
+def lookup(book):
+    """Copy Amazon product details for `book.item_id` onto `book` in place, retrying every 3 s on HTTPError."""
+    while True:
+        try:
+            product = amazon.lookup(ItemId=book.item_id)
+            book.author = product.author
+            book.pages = product.pages
+            book.publisher = product.publisher
+            book.brand = product.brand
+            book.asin = product.asin
+            book.binding = product.binding
+            book.edition = product.edition
+            book.editorial_review = product.editorial_review
+            book.isbn = product.isbn
+            book.large_image_url = product.large_image_url
+            book.region = product.region
+            if product.release_date:  # guarded like publication_date: strftime on a missing date would raise
+                book.release_date = product.release_date.strftime("%Y-%m-%d")
+            if product.publication_date:
+                book.publication_date = product.publication_date.strftime("%Y-%m-%d")
+            book.sales_rank = product.sales_rank
+            book.medium_image_url = product.medium_image_url
+            book.small_image_url = product.small_image_url
+            if product.languages:
+                book.languages = list(product.languages)
+            print('cached: ' + book.item_id + ' -> ' + book.title)
+            break
+        except HTTPError as e:
+            print(e)
+            t.sleep(3)  # back off, then let the while loop retry the same item
--- a/kindle/book.py
+++ b/kindle/book.py
@@ -11,6 +11,32 @@ class Book:
    url = ''
    min_day = ''

+    item_id = None
+    pages = None
+    publisher = None
+    brand = None
+    asin = None
+    binding = None
+    edition = None
+    editorial_review = None
+    isbn = None
+    large_image_url = None
+    region = None
+    release_date = None
+    sales_rank = None
+    medium_image_url = None
+    publication_date = None
+    small_image_url = None
+    languages = None
+
    def json(self):
        return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)

+    def dump(self):  # dict of this instance's own attributes with None values pruned; used as the json.dumps default hook
+        return clean_dict(self.__dict__)
+
+
+def clean_dict(d):
+    """Recursively drop None-valued entries from (nested) dicts; any non-dict value is returned as is."""
+    # Only dicts are descended into, so a dict nested inside a list keeps its None values.
+    return {k: clean_dict(v) for k, v in d.items() if v is not None} if isinstance(d, dict) else d
--- a/kindle/free_book.py
+++ b/kindle/free_book.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+import io
+import json
+import os
+import re
+
+import requests
+
+from book import Book
+import config
+
+cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page='  # listing URL; fetch_free_books appends the page number
+en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page='  # same listing with the extra node n:116170071
+base_url = 'https://www.amazon.cn/gp/product/'  # product page prefix; an item's data-asin is appended
+page_dir = 'page/'  # output directory for the per-page JSON files
+
+
+def fetch_free_books(url, page):
+    """Scrape listing page `page` of `url`; return {'books': [Book, ...], 'count': int, 'page': page}."""
+    r = requests.get(url + str(page), headers=config.header)
+    from bs4 import BeautifulSoup, Tag
+    import lxml  # lxml.__name__ is just the string 'lxml', the parser name handed to BeautifulSoup
+    bs = BeautifulSoup(r.text, lxml.__name__)
+    items = bs.find_all('li', attrs={'class': 's-result-item celwidget'})
+
+    kindle = {'books': []}
+
+    for item in items:
+        if isinstance(item, Tag):
+            book = Book()
+            book.title = item.find('h2').text
+            book.item_id = item.get('data-asin')
+            book.url = base_url + book.item_id
+            book.average = 0
+            book.price = 0
+            book.min = 0
+            score = item.find('span', attrs={'class': 'a-icon-alt'})
+            # The rating text may not follow the expected wording; leave the score unset instead of crashing.
+            rating = re.match('平均(.*) 星', score.text) if score else None
+            if rating:
+                book.score = rating.group(1)
+            import amz
+            amz.lookup(book)  # fills in the remaining product details in place
+
+            kindle['books'].append(book)
+
+    kindle['count'] = len(kindle['books'])
+    kindle['page'] = page
+    return kindle
+
+
+def get_free_cn_books(page):  # scrape page `page` of the cn_url listing and save it as JSON under page_dir
+    kindle = fetch_free_books(cn_url, page)
+    with io.open(page_dir + 'kindle_free_books_cn_' + str(page) + '.json', 'w', encoding='utf-8') as f:
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))  # Book objects are serialised through their dump()
+
+
+def get_free_en_books(page):  # scrape page `page` of the en_url listing and save it as JSON under page_dir
+    kindle = fetch_free_books(en_url, page)
+    with io.open(page_dir + 'kindle_free_books_en_' + str(page) + '.json', 'w', encoding='utf-8') as f:
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))  # Book objects are serialised through their dump()
+
+
+def get_free_books():
+    """Write one JSON file per listing page (1-399): every Chinese page first, then every English page."""
+    if not os.path.exists(page_dir):
+        os.mkdir(page_dir)
+
+    # Same page range for both listings; the CN pass finishes before the EN pass starts.
+    for fetch_page in (get_free_cn_books, get_free_en_books):
+        for page in range(1, 400):
+            fetch_page(page)
+
+get_free_books()  # NOTE(review): runs the whole scrape whenever this module is executed or imported; consider an `if __name__ == '__main__':` guard
--- a/kindle/kindle.py
+++ b/kindle/kindle.py
@@ -1,46 +1,14 @@
 #!/usr/bin/env python3
 import io
 import json
-import os
 import re
-import time as t
-from urllib.error import HTTPError

-from amazon.api import AmazonAPI
 import requests
 from bs4 import Tag

 import config
 from book import Book

-cache_dir = 'cache/'
-
-
-def write_query_to_db(cache_url, data):
-
-    if not os.path.exists(cache_dir):
-        os.mkdir(cache_dir)
-
-    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
-    f = open(file, 'wb')
-    f.write(data)
-
-
-def read_query_from_db(cache_url):
-    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
-    if os.path.exists(file) and os.path.getmtime(file) > t.time() - 20 * 60 * 60 * 1000:
-        f = open(file, 'rb')
-        return f.read()
-    return None
-
-
-amazon = AmazonAPI(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY, config.AWS_ASSOCIATE_TAG,
-                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
-
-user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
-
-header = {'User-Agent': user_agent}
-

 def fetch(url, headers, cookies):
    r = requests.get(url, headers=headers, cookies=cookies)
@@ -77,33 +45,13 @@ def fetch(url, headers, cookies):
            book.score = matches.group(4)
            book.min = matches.group(5)

-            while True:
-                try:
-                    product = amazon.lookup(ItemId=book.item_id)
-
-                    book.author = product.author
-                    book.pages = product.pages
-                    book.publisher = product.publisher
-                    book.brand = product.brand
-                    book.asin = product.asin
-                    book.binding = product.binding
-                    book.edition = product.edition
-                    book.editorial_reviews = product.editorial_reviews
-                    book.isbn = product.isbn
-                    book.large_image_url = product.large_image_url
-                    book.region = product.region
-                    book.release_date = product.release_date.strftime("%Y-%m-%d")
-                    book.sales_rank = product.sales_rank
+            import amz
+            amz.lookup(book)

            kindle['books'].append(book)
-                    print('cached: ' + book.item_id + ' -> ' + book.title)
-                    break
-                except HTTPError:
-                    t.sleep(2)
-                    pass

    with io.open('kindle.json', 'w', encoding='utf-8') as f:
-        f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True))
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))

 if __name__ == '__main__':
-    fetch('http://t.bookdna.cn', header, {})
+    fetch('http://t.bookdna.cn', config.header, {})