add freebooks

parent 4ee49d6d4c
commit 3c478216bd
@@ -3,4 +3,5 @@ kindle.json
 __pycache__
 venv
 config.py
 cache
+page
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+import time as t
+import os
+import re
+from urllib.error import HTTPError
+
+from amazon.api import AmazonAPI
+
+import config
+
+cache_dir = 'cache/'
+
+
+def write_query_to_db(cache_url, data):
+    if not os.path.exists(cache_dir):
+        os.mkdir(cache_dir)
+
+    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
+    f = open(file, 'wb')
+    f.write(data)
+
+
+def read_query_from_db(cache_url):
+    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
+    if os.path.exists(file) and t.time() - os.path.getmtime(file) < 100 * 24 * 60 * 60 * 1000:
+        f = open(file, 'rb')
+        return f.read()
+    return None
+
+
+amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG,
+                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
+
+
+def lookup(book):
+    while True:
+        try:
+            product = amazon.lookup(ItemId=book.item_id)
+
+            book.author = product.author
+            book.pages = product.pages
+            book.publisher = product.publisher
+            book.brand = product.brand
+            book.asin = product.asin
+            book.binding = product.binding
+            book.edition = product.edition
+            book.editorial_review = product.editorial_review
+            book.isbn = product.isbn
+            book.large_image_url = product.large_image_url
+            book.region = product.region
+            book.release_date = product.release_date.strftime("%Y-%m-%d")
+            if product.publication_date:
+                book.publication_date = product.publication_date.strftime("%Y-%m-%d")
+            book.sales_rank = product.sales_rank
+            book.medium_image_url = product.medium_image_url
+            book.small_image_url = product.small_image_url
+            if product.languages:
+                book.languages = list(product.languages)
+            print('cached: ' + book.item_id + ' -> ' + book.title)
+            break
+        except HTTPError as e:
+            print(e)
+            t.sleep(3)
+            pass
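Note on the freshness test in read_query_from_db above: time.time() and os.path.getmtime() both return seconds, so the trailing 1000 in `100 * 24 * 60 * 60 * 1000` reads like a leftover milliseconds conversion and stretches the window to roughly 273 years instead of 100 days. A minimal sketch of the same check with the unit made explicit (the helper name is illustrative, not part of the commit):

    import os
    import time

    CACHE_TTL_SECONDS = 100 * 24 * 60 * 60  # 100 days, expressed in seconds

    def is_fresh(path):
        # both time.time() and os.path.getmtime() are seconds since the epoch
        return os.path.exists(path) and time.time() - os.path.getmtime(path) < CACHE_TTL_SECONDS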
@@ -11,6 +11,32 @@ class Book:
     url = ''
     min_day = ''
 
+    item_id = None
+    pages = None
+    publisher = None
+    brand = None
+    asin = None
+    binding = None
+    edition = None
+    editorial_review = None
+    isbn = None
+    large_image_url = None
+    region = None
+    release_date = None
+    sales_rank = None
+    medium_image_url = None
+    publication_date = None
+    small_image_url = None
+    languages = None
+
     def json(self):
         return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)
 
+    def dump(self):
+        return clean_dict(self.__dict__)
+
+
+def clean_dict(d):
+    if not isinstance(d, dict):
+        return d
+    return dict((k, clean_dict(v)) for k, v in d.items() if v is not None)
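clean_dict() recurses into nested dicts and drops None-valued keys, so Book.dump() serializes only the fields a lookup actually filled in. A quick illustration with made-up values (not data from the commit):

    def clean_dict(d):
        if not isinstance(d, dict):
            return d
        return dict((k, clean_dict(v)) for k, v in d.items() if v is not None)

    print(clean_dict({'title': 'Example', 'isbn': None, 'meta': {'pages': 200, 'edition': None}}))
    # -> {'title': 'Example', 'meta': {'pages': 200}}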
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+import io
+import json
+import os
+import re
+
+import requests
+
+from book import Book
+import config
+
+cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page='
+en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page='
+base_url = 'https://www.amazon.cn/gp/product/'
+page_dir = 'page/'
+
+
+def fetch_free_books(url, page):
+    r = requests.get(url + str(page), headers=config.header)
+    from bs4 import BeautifulSoup, Tag
+    import lxml
+
+    bs = BeautifulSoup(r.text, lxml.__name__)
+    items = bs.find_all('li', attrs={'class': 's-result-item celwidget'})
+
+    kindle = {'books': []}
+
+    for item in items:
+        if isinstance(item, Tag):
+            book = Book()
+            book.title = item.find('h2').text
+            # book.item_id = item.find('span', attrs={'name': re.compile('.*')}).get('name')
+            book.item_id = item.get('data-asin')
+            book.url = base_url + book.item_id
+            book.average = 0
+            book.price = 0
+            book.min = 0
+            score = item.find('span', attrs={'class': 'a-icon-alt'})
+            if score:
+                book.score = re.match('平均(.*) 星', score.text).group(1)
+
+            import amz
+            amz.lookup(book)
+
+            kindle['books'].append(book)
+
+    kindle['count'] = len(kindle['books'])
+    kindle['page'] = page
+    return kindle
+
+
+def get_free_cn_books(page):
+    kindle = fetch_free_books(cn_url, page)
+    with io.open(page_dir + 'kindle_free_books_cn_' + str(page) + '.json', 'w', encoding='utf-8') as f:
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
+
+
+def get_free_en_books(page):
+    kindle = fetch_free_books(en_url, page)
+    with io.open(page_dir + 'kindle_free_books_en_' + str(page) + '.json', 'w', encoding='utf-8') as f:
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
+
+
+def get_free_books():
+    if not os.path.exists(page_dir):
+        os.mkdir(page_dir)
+
+    for page in range(1, 400):
+        get_free_cn_books(page)
+
+    for page in range(1, 400):
+        get_free_en_books(page)
+
+get_free_books()
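The scrape above keys off two pieces of the result-page markup: each `li.s-result-item` carries its ASIN in a `data-asin` attribute, and the star rating sits in the `a-icon-alt` span. A self-contained sketch of the same extraction against a canned snippet (the live amazon.cn markup may differ; assumes bs4 and lxml are installed):

    import re
    from bs4 import BeautifulSoup

    html = ('<li class="s-result-item celwidget" data-asin="B00EXAMPLE">'
            '<h2>Example Title</h2><span class="a-icon-alt">平均4.2 星</span></li>')

    bs = BeautifulSoup(html, 'lxml')
    for item in bs.find_all('li', attrs={'class': 's-result-item celwidget'}):
        score = item.find('span', attrs={'class': 'a-icon-alt'})
        print(item.get('data-asin'),
              item.find('h2').text,
              re.match('平均(.*) 星', score.text).group(1))
    # -> B00EXAMPLE Example Title 4.2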
@@ -1,46 +1,14 @@
 #!/usr/bin/env python3
 import io
 import json
-import os
 import re
-import time as t
-from urllib.error import HTTPError
 
-from amazon.api import AmazonAPI
-
 import requests
 from bs4 import Tag
 
 import config
 from book import Book
 
-cache_dir = 'cache/'
-
-
-def write_query_to_db(cache_url, data):
-    if not os.path.exists(cache_dir):
-        os.mkdir(cache_dir)
-
-    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
-    f = open(file, 'wb')
-    f.write(data)
-
-
-def read_query_from_db(cache_url):
-    file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'
-    if os.path.exists(file) and os.path.getmtime(file) > t.time() - 20 * 60 * 60 * 1000:
-        f = open(file, 'rb')
-        return f.read()
-    return None
-
-
-amazon = AmazonAPI(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY, config.AWS_ASSOCIATE_TAG,
-                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)
-
-user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
-
-header = {'User-Agent': user_agent}
-
 
 def fetch(url, headers, cookies):
     r = requests.get(url, headers=headers, cookies=cookies)
@@ -77,33 +45,13 @@ def fetch(url, headers, cookies):
             book.score = matches.group(4)
             book.min = matches.group(5)
 
-            while True:
-                try:
-                    product = amazon.lookup(ItemId=book.item_id)
-
-                    book.author = product.author
-                    book.pages = product.pages
-                    book.publisher = product.publisher
-                    book.brand = product.brand
-                    book.asin = product.asin
-                    book.binding = product.binding
-                    book.edition = product.edition
-                    book.editorial_reviews = product.editorial_reviews
-                    book.isbn = product.isbn
-                    book.large_image_url = product.large_image_url
-                    book.region = product.region
-                    book.release_date = product.release_date.strftime("%Y-%m-%d")
-                    book.sales_rank = product.sales_rank
-
-                    kindle['books'].append(book)
-                    print('cached: ' + book.item_id + ' -> ' + book.title)
-                    break
-                except HTTPError:
-                    t.sleep(2)
-                    pass
+            import amz
+            amz.lookup(book)
+
+            kindle['books'].append(book)
 
     with io.open('kindle.json', 'w', encoding='utf-8') as f:
-        f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True))
+        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))
 
 if __name__ == '__main__':
-    fetch('http://t.bookdna.cn', header, {})
+    fetch('http://t.bookdna.cn', config.header, {})
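The json.dumps `default` hook is what makes the Book objects serializable: the encoder calls it for any value it cannot encode natively, and the switch from `o.__dict__` to `o.dump()` routes that call through clean_dict(). A sketch with a stand-in class (names illustrative only, not from the commit):

    import json

    class Thing:  # stand-in for Book
        def __init__(self):
            self.title = 'Example'
            self.isbn = None

        def dump(self):
            return {k: v for k, v in self.__dict__.items() if v is not None}

    print(json.dumps({'books': [Thing()]}, default=lambda o: o.dump(), ensure_ascii=False))
    # -> {"books": [{"title": "Example"}]}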