1 爬取数据代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
# coding=utf-8
"""Scrape Taobao search-result listings with Selenium.

The script opens taobao.com in Chrome, submits a search, walks every result
page through the pager widget, parses each item with PyQuery and hands it to
``mysqlOp.mysqlOp`` for storage.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# TimeoutException is what WebDriverWait.until raises when a wait expires
from selenium.common.exceptions import TimeoutException
# Regular expressions
import re
import sys
from pyquery import PyQuery as pq
from config import *
# Database helper module
import mysqlOp

# Upper bound on attempts for one page action; replaces the unbounded
# recursive retry so a permanently broken page eventually surfaces an error.
_MAX_ATTEMPTS = 5

driver = webdriver.Chrome()
# Alternative: use the PhantomJS browser driver
# driver = webdriver.PhantomJS()
driver.get("https://www.taobao.com")
driver.set_window_size(1400, 900)
wait = WebDriverWait(driver, 10)


def search():
    """Submit the search form on the home page and scrape the first page.

    Returns:
        The number of items scraped from the first result page.

    Raises:
        TimeoutException: if every one of the ``_MAX_ATTEMPTS`` attempts
            timed out.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            # The expected-condition helpers take ONE locator tuple, not two
            # positional arguments.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")
                )
            )
            search_box.clear()
            search_box.send_keys("美食")
            submit.click()
            # Scrape the first page of results
            return get_goods()
        except TimeoutException:
            if attempt == _MAX_ATTEMPTS:
                raise


# Read the total page count
def get_total():
    """Return the text of the pager's "total pages" element."""
    total = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
        )
    )
    return total.text


# Pagination
def next_page(page):
    """Jump to result page ``page`` via the pager form and scrape it.

    Args:
        page: 1-based page number to navigate to.

    Returns:
        The number of items scraped from that page.

    Raises:
        TimeoutException: if every one of the ``_MAX_ATTEMPTS`` attempts
            timed out.
    """
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            page_box.clear()
            page_box.send_keys(str(page))
            submit.click()
            # Wait until the highlighted pager entry shows the requested page
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page),
                )
            )
            # Scrape the page that is now displayed; returning from inside the
            # try keeps the count defined on every path.
            return get_goods()
        except TimeoutException:
            if attempt == _MAX_ATTEMPTS:
                raise


def get_goods():
    """Parse every item on the current result page and store each one.

    Returns:
        The number of items found on the page.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    html = driver.page_source
    doc = pq(html)
    items = doc("#mainsrp-itemlist .items .item").items()
    count = 0
    for item in items:
        goods = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text
            # (presumably a fixed unit suffix -- TODO confirm).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(goods)
        # Insert the record into the database
        mysqlOp.mysqlOp(goods)
        count += 1
    return count


def main():
    """Run the full scrape and print the page count and the item total.

    Raises:
        ValueError: if no page number can be found in the pager text.
    """
    # Page 1 is scraped by search(); include its items in the total.
    total_count = search() or 0
    total_text = get_total()
    # Extract the page number with a regular expression
    match = re.search(r"(\d+)", total_text)
    if match is None:
        raise ValueError("no page count found in pager text: %r" % (total_text,))
    total = int(match.group(1))
    print(total)
    for i in range(2, total + 1):
        total_count += next_page(i)
    print(total_count)


if __name__ == "__main__":
    try:
        main()
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()
2 存入到 MySQL 中
创建一个mysqlOp.py的文件
1
2
3
4
5
6
7
8
9
|
# coding=utf-8
from pymysql import *


def mysqlOp(goods):
    """Insert one scraped item into the ``goods`` table.

    A new connection is opened for the call and is always closed again,
    including when the insert or the commit raises.

    Args:
        goods: mapping with the keys ``image``, ``price``, ``deal``,
            ``title``, ``shop`` and ``location``.

    Raises:
        KeyError: if ``goods`` lacks one of the keys above.
    """
    # NOTE(review): connection settings and credentials are hard-coded here;
    # consider moving them to configuration outside the source tree.
    conn = connect(host='127.0.0.1', port=3306, user='root',
                   passwd='1qaz2wsx#EDC', db='taobao_meishi', charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            # Parameterized query: the driver escapes the values itself.
            cursor.execute(
                "insert into goods(image,price,deal,title,shop,location) values(%s,%s,%s,%s,%s,%s)",
                (goods['image'], goods['price'], goods['deal'],
                 goods['title'], goods['shop'], goods['location']),
            )
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
转自:https://www.cnblogs.com/yinliang-liang/p/9391746.html