From 3d880bb1875d2af1787d6da403a66759b1394c74 Mon Sep 17 00:00:00 2001
From: tianyu
Date: Sun, 25 Sep 2016 00:52:16 +0800
Subject: [PATCH] add kindle

---
 Kindle/.gitignore       |  3 ++
 Kindle/README.md        | 13 +++++++
 Kindle/book.py          | 21 +++++++++++
 Kindle/kindle.py        | 80 +++++++++++++++++++++++++++++++++++++++++
 Kindle/requirements.txt |  3 ++
 5 files changed, 120 insertions(+)
 create mode 100644 Kindle/.gitignore
 create mode 100644 Kindle/README.md
 create mode 100644 Kindle/book.py
 create mode 100755 Kindle/kindle.py
 create mode 100644 Kindle/requirements.txt

diff --git a/Kindle/.gitignore b/Kindle/.gitignore
new file mode 100644
index 0000000..eeffe73
--- /dev/null
+++ b/Kindle/.gitignore
@@ -0,0 +1,3 @@
+kindle.json
+.idea
+__pycache__
\ No newline at end of file
diff --git a/Kindle/README.md b/Kindle/README.md
new file mode 100644
index 0000000..db7b4fd
--- /dev/null
+++ b/Kindle/README.md
@@ -0,0 +1,13 @@
+# Kindle
+
+## Install dependencies
+
+```shell
+pip3 install -r requirements.txt
+```
+
+## Run
+
+```shell
+python3 kindle.py
+```
\ No newline at end of file
diff --git a/Kindle/book.py b/Kindle/book.py
new file mode 100644
index 0000000..12a33a7
--- /dev/null
+++ b/Kindle/book.py
@@ -0,0 +1,21 @@
+import json
+
+
+class Book:
+    """One book row; kindle.fetch fills the fields from the scraped page."""
+
+    def __init__(self):
+        # Instance (not class) attributes: json() dumps __dict__, which holds
+        # instance attributes only, so class-level defaults were left out.
+        self.title = ''
+        self.average = 0  # historical average price
+        self.price = 0  # current price
+        self.author = ''
+        self.min = 0  # historical lowest price
+        self.score = 0  # rating
+        self.url = ''
+        self.min_day = ''  # when the lowest price was most recently reached
+
+    def json(self):
+        """Return the book as an indented JSON string with sorted keys."""
+        return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)
diff --git a/Kindle/kindle.py b/Kindle/kindle.py
new file mode 100755
index 0000000..f99ab6d
--- /dev/null
+++ b/Kindle/kindle.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""Scrape book prices from a web page and save them to kindle.json."""
+import io
+import json
+import re
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+from book import Book
+
+user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
+
+header = {'User-Agent': user_agent}
+
+# Text of one book row: average price, current price, author, score and
+# historical lowest price, in that order. Compiled once, used per row.
+BOOK_RE = re.compile('.*历史均价:¥(.*),现价:¥(.*)作者:(.*),评分:(.*),历史最低价:¥(.*)')
+
+
+def fetch(url, headers, cookies):
+    """Scrape the book list at ``url`` and write it to ``kindle.json``.
+
+    :param url: address of the page to download
+    :param headers: HTTP headers passed to ``requests.get``
+    :param cookies: cookies passed to ``requests.get``
+    :raises requests.RequestException: on a network failure or HTTP error
+    :raises ValueError: if the page has no recognisable update time
+    """
+    # Without a timeout a stalled server would hang the script forever.
+    r = requests.get(url, headers=headers, cookies=cookies, timeout=30)
+    r.raise_for_status()
+
+    bs = BeautifulSoup(r.text, 'lxml')
+
+    stamp = bs.find('span', style='color:#FFF9A8')
+    updated = re.match('数据更新于:(.*)', stamp.text if stamp is not None else '')
+    if updated is None:
+        raise ValueError('no update time found at ' + url)
+
+    kindle = {'time': updated.group(1), 'books': []}
+
+    for book_item in bs.find_all('div', style='margin-bottom: 0.9em;'):
+        if not isinstance(book_item, Tag):
+            continue
+
+        matches = BOOK_RE.match(book_item.text)
+        if matches is None:
+            # Styled like a book row but not one; skip instead of crashing.
+            continue
+
+        book = Book()
+        book.average = matches.group(1)
+        book.price = matches.group(2)
+        book.author = matches.group(3)
+        book.score = matches.group(4)
+        book.min = matches.group(5)
+
+        span = book_item.find('span', title=re.compile('最近在'))
+        if isinstance(span, Tag):
+            min_day = re.match('最近在(.*)达到最低价', span.get('title'))
+            if min_day:
+                book.min_day = min_day.group(1)
+
+        a = book_item.find('a')
+        if isinstance(a, Tag):
+            href = a.get('href') or ''
+            # "https?" keeps an already-secure link from becoming "httpss://".
+            link = re.match('https?(://.*)/ref', href)
+            book.url = ('https' + link.group(1)) if link else href
+            book.title = a.get('title')
+
+        kindle['books'].append(book)
+
+    with io.open('kindle.json', 'w', encoding='utf-8') as f:
+        f.write(json.dumps(kindle, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True))
+
+
+if __name__ == '__main__':
+    fetch('http://t.bookdna.cn', header, {})
diff --git a/Kindle/requirements.txt b/Kindle/requirements.txt
new file mode 100644
index 0000000..1c47764
--- /dev/null
+++ b/Kindle/requirements.txt
@@ -0,0 +1,3 @@
+lxml == 3.5.0
+requests == 2.9.1
+beautifulsoup4 == 4.5.1