add freebooks

This commit is contained in:
tianyu 2016-09-28 00:18:48 +08:00
parent 4ee49d6d4c
commit 3c478216bd
5 changed files with 171 additions and 58 deletions

3
kindle/.gitignore vendored
View File

@@ -3,4 +3,5 @@ kindle.json
__pycache__ __pycache__
venv venv
config.py config.py
cache cache
page

64
kindle/amz.py Normal file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
import time as t
import os
import re
from urllib.error import HTTPError
from amazon.api import AmazonAPI
import config
cache_dir = 'cache/'


def _cache_path(cache_url):
    """Map a product-API request URL to its cache file: cache/<ItemId>.xml."""
    return cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'


def write_query_to_db(cache_url, data):
    """Persist the raw XML API response (*data*, bytes) for *cache_url*.

    Used as the AmazonAPI CacheWriter callback so later runs can skip the
    network call.
    """
    # makedirs(exist_ok=True) is race-free, unlike the original exists()+mkdir()
    os.makedirs(cache_dir, exist_ok=True)
    # with-statement closes the handle even on error (original leaked it)
    with open(_cache_path(cache_url), 'wb') as f:
        f.write(data)


def read_query_from_db(cache_url):
    """Return cached XML bytes for *cache_url*, or None if absent or expired.

    Entries expire after 100 days. Both time.time() and os.path.getmtime()
    are in seconds; the original multiplied the threshold by 1000 as though
    they were milliseconds, so the cache effectively never expired.
    """
    file = _cache_path(cache_url)
    if os.path.exists(file) and t.time() - os.path.getmtime(file) < 100 * 24 * 60 * 60:
        with open(file, 'rb') as f:
            return f.read()
    return None
# Product Advertising API client for the Chinese (CN) marketplace, throttled
# just under one request per second and backed by the on-disk XML query cache.
amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG,
region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
def lookup(book):
    """Fill *book* in place with product metadata fetched by item_id.

    Retries forever on HTTPError (the API throttles aggressively), sleeping
    3 seconds between attempts. Returns None; mutates *book*.
    """
    while True:
        try:
            product = amazon.lookup(ItemId=book.item_id)
            book.author = product.author
            book.pages = product.pages
            book.publisher = product.publisher
            book.brand = product.brand
            book.asin = product.asin
            book.binding = product.binding
            book.edition = product.edition
            book.editorial_review = product.editorial_review
            book.isbn = product.isbn
            book.large_image_url = product.large_image_url
            book.region = product.region
            # Dates can be missing; guard both. The original guarded only
            # publication_date and crashed (.strftime on None) when
            # release_date was absent.
            if product.release_date:
                book.release_date = product.release_date.strftime("%Y-%m-%d")
            if product.publication_date:
                book.publication_date = product.publication_date.strftime("%Y-%m-%d")
            book.sales_rank = product.sales_rank
            book.medium_image_url = product.medium_image_url
            book.small_image_url = product.small_image_url
            if product.languages:
                book.languages = list(product.languages)
            print('cached: ' + book.item_id + ' -> ' + book.title)
            break
        except HTTPError as e:
            print(e)
            t.sleep(3)

View File

@@ -11,6 +11,32 @@ class Book:
url = '' url = ''
min_day = '' min_day = ''
# Amazon product metadata; all None until filled in by amz.lookup.
item_id = None
pages = None
publisher = None
brand = None
asin = None
binding = None
edition = None
editorial_review = None
isbn = None
large_image_url = None
region = None
# release_date / publication_date are stored as "YYYY-MM-DD" strings
release_date = None
sales_rank = None
medium_image_url = None
publication_date = None
small_image_url = None
# list of language descriptors from the product feed, if any
languages = None
def json(self): def json(self):
return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True) return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)
def dump(self):
    """Return this book as a plain dict with every None-valued entry removed."""
    return clean_dict(vars(self))
def clean_dict(d):
    """Recursively drop None-valued keys from dict *d*.

    Non-dict values (including those nested inside) pass through unchanged.
    """
    if isinstance(d, dict):
        return {k: clean_dict(v) for k, v in d.items() if v is not None}
    return d

74
kindle/free_book.py Normal file
View File

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
import io
import json
import os
import re
import requests
from book import Book
import config
cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page='
en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page='
base_url = 'https://www.amazon.cn/gp/product/'
page_dir = 'page/'
def fetch_free_books(url, page):
    """Scrape one result page of free Kindle books into a serializable dict.

    Returns {'books': [Book, ...], 'count': int, 'page': page}; each Book is
    enriched with product metadata via amz.lookup.
    """
    # Imports stay function-local (matches the file's style; importing amz
    # constructs the API client) but are hoisted out of the per-item loop —
    # the original re-imported amz for every scraped book.
    import amz
    from bs4 import BeautifulSoup, Tag
    import lxml

    r = requests.get(url + str(page), headers=config.header)
    bs = BeautifulSoup(r.text, lxml.__name__)
    items = bs.find_all('li', attrs={'class': 's-result-item celwidget'})
    kindle = {'books': []}
    for item in items:
        if isinstance(item, Tag):
            book = Book()
            book.title = item.find('h2').text
            book.item_id = item.get('data-asin')
            book.url = base_url + book.item_id
            book.average = 0
            book.price = 0
            book.min = 0
            score = item.find('span', attrs={'class': 'a-icon-alt'})
            if score:
                # e.g. "平均4.2 星" -> "4.2"; skip if the text format differs
                # (the original crashed with AttributeError on .group of None)
                m = re.match('平均(.*) 星', score.text)
                if m:
                    book.score = m.group(1)
            amz.lookup(book)
            kindle['books'].append(book)
    kindle['count'] = len(kindle['books'])
    kindle['page'] = page
    return kindle
def get_free_cn_books(page):
    """Scrape page *page* of free Chinese Kindle books and write it as JSON."""
    result = fetch_free_books(cn_url, page)
    out_path = page_dir + 'kindle_free_books_cn_' + str(page) + '.json'
    with io.open(out_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
def get_free_en_books(page):
    """Scrape page *page* of free English Kindle books and write it as JSON."""
    result = fetch_free_books(en_url, page)
    out_path = page_dir + 'kindle_free_books_en_' + str(page) + '.json'
    with io.open(out_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
def get_free_books(max_page=400):
    """Crawl the free-book listings, Chinese then English, pages 1..max_page-1.

    Each page is written as a JSON file under page_dir (created if missing).
    max_page defaults to the original hard-coded 400 for backward compatibility.
    """
    # makedirs(exist_ok=True) is race-free, unlike exists()+mkdir()
    os.makedirs(page_dir, exist_ok=True)
    for page in range(1, max_page):
        get_free_cn_books(page)
    for page in range(1, max_page):
        get_free_en_books(page)
if __name__ == '__main__':
    # Guard the crawl so merely importing this module doesn't trigger
    # hundreds of HTTP fetches as a side effect.
    get_free_books()

View File

@@ -1,46 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import io import io
import json import json
import os
import re import re
import time as t
from urllib.error import HTTPError
from amazon.api import AmazonAPI
import requests import requests
from bs4 import Tag from bs4 import Tag
import config import config
from book import Book from book import Book
cache_dir = 'cache/'
def write_query_to_db(cache_url, data):
if not os.path.exists(cache_dir):
os.mkdir(cache_dir)
file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
f = open(file, 'wb')
f.write(data)
def read_query_from_db(cache_url):
file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
if os.path.exists(file) and os.path.getmtime(file) > t.time() - 20 * 60 * 60 * 1000:
f = open(file, 'rb')
return f.read()
return None
amazon = AmazonAPI(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY, config.AWS_ASSOCIATE_TAG,
region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
header = {'User-Agent': user_agent}
def fetch(url, headers, cookies): def fetch(url, headers, cookies):
r = requests.get(url, headers=headers, cookies=cookies) r = requests.get(url, headers=headers, cookies=cookies)
@@ -77,33 +45,13 @@ def fetch(url, headers, cookies):
book.score = matches.group(4) book.score = matches.group(4)
book.min = matches.group(5) book.min = matches.group(5)
while True: import amz
try: amz.lookup(book)
product = amazon.lookup(ItemId=book.item_id)
book.author = product.author kindle['books'].append(book)
book.pages = product.pages
book.publisher = product.publisher
book.brand = product.brand
book.asin = product.asin
book.binding = product.binding
book.edition = product.edition
book.editorial_reviews = product.editorial_reviews
book.isbn = product.isbn
book.large_image_url = product.large_image_url
book.region = product.region
book.release_date = product.release_date.strftime("%Y-%m-%d")
book.sales_rank = product.sales_rank
kindle['books'].append(book)
print('cached: ' + book.item_id + ' -> ' + book.title)
break
except HTTPError:
t.sleep(2)
pass
with io.open('kindle.json', 'w', encoding='utf-8') as f: with io.open('kindle.json', 'w', encoding='utf-8') as f:
f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)) f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
if __name__ == '__main__': if __name__ == '__main__':
fetch('http://t.bookdna.cn', header, {}) fetch('http://t.bookdna.cn', config.header, {})