import requests
import sys
import re
import pymysql
class product:
    """Plain record holding the fields scraped for one product page.

    Every field is a class-level default. Assigning to a field on an
    instance shadows the default for that instance only, so a fresh
    ``product()`` always starts from the values below.
    """

    # Category label; the default is the Chinese word for "history".
    type = "历史"
    name = ""
    author = ""
    # (sic) The misspelling is part of the interface: getProData assigns
    # to this exact attribute name.
    desciption = ""
    pic1 = ""
    languages = ""
    press = ""
# Requests amazon.cn search-result pages 1 through 6, runs a regex over each
# response body, and returns the collected matches as a set.
# NOTE(review): indentation is missing in this copy of the file, so the loop
# nesting described in the comments below is inferred — confirm against the
# original source.
def getProUrl():
urlList = []
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
session = requests.Session()
# Base search URL; the page number is appended to it on every iteration.
furl="https://www.amazon.cn/gp/search/ref=sr_adv_b/?search-alias=stripbooks&field-binding_browse-bin=2038564051&sort=relevancerank&page="
# range(1,7) yields page numbers 1..6.
for i in range(1,7):
html=""
# Prints the URL without the '&node=658418051' suffix that the request adds.
print(furl+str(i))
# NOTE(review): the search page is requested with POST, not GET — confirm
# this is intentional.
html = session.post(furl+str(i)+'&node=658418051',headers = headers)
# Fixes the codec used when html.text decodes the response body.
html.encoding = 'utf-8'
# NOTE(review): `s` is never read anywhere in the visible code.
s=html.text.encode('gb2312','ignore').decode('gb2312')
# NOTE(review): the pattern literal below is split across two lines with
# nothing between the quotes. Its contents appear to have been lost (possibly
# markup stripped during extraction); the original regex must be restored
# before this function can run.
url=r'
'
reg=re.compile(url,re.M)
items = reg.findall(html.text)
# NOTE(review): this loop reuses the name `i` from the page loop above.
for i in range(0,len(items)):
urlList.append(items[i])
# Removes duplicates. NOTE(review): if this line sat inside the page loop, the
# next iteration's append() would be called on a set; presumably it runs once
# after the loop — confirm.
urlList=set(urlList)
return urlList
def getProData(url):
    """Download one product page and collect its fields into a ``product``.

    Args:
        url: String appended to ``https://www.amazon.cn/dp/`` to form the
            address of the page to fetch.

    Returns:
        A ``product`` whose ``pic1``, ``name``, ``author``, ``desciption``,
        ``press`` and ``languages`` attributes hold whatever the matching
        ``getPro*`` helper returned for the downloaded page.

    Raises:
        requests.RequestException: Propagated unchanged if the request fails
            or does not answer within the timeout.
    """
    pro = product()
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
    zurl = "https://www.amazon.cn/dp/"

    # The session is a context manager so its pooled connections are released
    # even when the request raises. The body is fully read by get() (no
    # streaming), so the response stays usable after the session closes.
    with requests.Session() as session:
        # A timeout keeps one stalled connection from hanging the whole run.
        html = session.get(zurl + url, headers=headers, timeout=30)

    # Fix the codec once so every helper reading html.text decodes the body
    # the same way.
    html.encoding = 'utf-8'

    pro.pic1 = getProPic(html)
    pro.name = getProname(html)
    pro.author = getProAuthor(html)
    pro.desciption = getProDescrip(html)
    pro.press = getProPress(html)
    pro.languages = getProLanguages(html)
    return pro
def getProPic(html):
    """Return the first image key found in the page's ``imgBlkFront`` tag.

    The pattern looks for ``id="imgBlkFront"`` followed by a
    ``data-a-dynamic-image`` attribute and captures the text between the
    opening ``{"`` and the next ``"``.

    Args:
        html: Object exposing the page markup through a ``text`` attribute
            (``getProData`` passes its HTTP response).

    Returns:
        The captured text of the first match, or ``""`` when the page
        contains no match.
    """
    # NOTE(review): the pattern expects raw double quotes inside the attribute
    # value; if the live page serves them as &quot; entities it will never
    # match — confirm against a real response.
    pic = r'id="imgBlkFront" data-a-dynamic-image="{"(.+?)".*?}"'
    # Only the first hit was ever used, so search() replaces findall() and
    # stops scanning as soon as it is found.
    found = re.search(pic, html.text, re.M)
    return found.group(1) if found else ""
# NOTE(review): this span looks like two functions fused together. The header
# is getProname, but the statements after the pattern literal work with
# `author`/`au`, and getProData calls a getProAuthor(html) that has no visible
# definition. Presumably the end of getProname and the `def getProAuthor` line
# were lost — confirm against the original file.
def getProname(html):
# NOTE(review): the pattern literal below is split across two lines and seems
# to have lost part of its contents (possibly markup stripped during
# extraction). As written it is not a valid single-line string.
name=r'
(.+?).*?(.+?).*?(.+?)'
# NOTE(review): `author` is never assigned in the visible code; the literal
# above is bound to `name`.
reg=re.compile(author,re.S)
items = reg.findall(html.text)
# Builds one string from elements 0 and 1 of every match; any further captured
# groups are not used.
au=""
for i in range(0,len(items)):
au=au+items[i][0]+items[i][1]
return au
def getProDescrip(html):
Descrip=r'.{0,30}出版社:(.+?)
评论列表(0条)