Problem description and background
How do I pair up the scraped name, price, time, and other fields and write them into a CSV file? (The scraper uses multithreading and Selenium WebDriver.)
Code example
```python
# Imports the snippet relies on (reconstructed)
import concurrent.futures
from datetime import datetime
from time import sleep

import pandas as pd
import pytz
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

urlList = [ ........ ]  # 200 URLs omitted
data = pd.read_csv("C:\\Users\\12987\\desktop\\zipcode\\zc.csv")
# converting column data to list
zipCodeList = data['Zipcode'].tolist()

while True:
    # shared result lists, appended to concurrently by every worker thread
    priceArray = []
    nameArray = []
    zipCodeArray = []
    GMTArray = []
    TCIN = []
    UPC = []

    def ScrapingTarget(url):
        wait_imp = 10
        CO = webdriver.ChromeOptions()
        CO.add_experimental_option('useAutomationExtension', False)
        CO.add_argument('--ignore-certificate-errors')
        CO.add_argument('--start-maximized')
        wd = webdriver.Chrome(r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe',
                              options=CO)
        wd.get(url)
        wd.implicitly_wait(wait_imp)
        for zipcode in zipCodeList:
            # click the My Store button
            myStore = wd.find_element(by=By.XPATH, value="//*[@id='web-store-id-msg-btn']/div[2]/div")
            myStore.click()
            sleep(0.5)
            # input the zip code
            inputZipCode = wd.find_element(by=By.XPATH, value="//*[@id='zip-or-city-state']")
            inputZipCode.clear()
            inputZipCode.send_keys(zipcode)
            # click lookup
            clickLoopUP = wd.find_element(by=By.XPATH, value="//*[@id='overlay-1']/div[2]/div[1]/div/div[3]/div[2]/button")
            clickLoopUP.click()
            sleep(0.5)
            # choose the store
            store = wd.find_element(by=By.XPATH, value="//*[@id='overlay-1']/div[2]/div[3]/div[2]/div[1]/button")
            store.click()
            # start scraping
            name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
            nameArray.append(name)
            price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
            priceArray.append(price)
            zipCodeArray.append(zipcode)
            tz = pytz.timezone('Europe/London')
            GMT = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
            GMTArray.append(GMT)
            # need to click "Show more" to reveal the TCIN and UPC
            xpath = '//*[@id="tabContent-tab-Details"]/div/button'
            element_present = EC.presence_of_element_located((By.XPATH, xpath))
            WebDriverWait(wd, 5).until(element_present)
            showMore = wd.find_element(by=By.XPATH, value=xpath)
            sleep(2)
            showMore.click()
            soup = BeautifulSoup(wd.page_source, 'html.parser')
            # get a list of all elements under "Specifications"
            div = soup.find("div", {"class": "styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight"})
            specs = [d.text for d in div.find_all("div")]
            # locate the TCIN / UPC entries in the list
            tcin = [v for v in specs if v.startswith("TCIN")]
            upc = [v for v in specs if v.startswith("UPC")]
            TCIN.append(tcin)
            UPC.append(upc)
            # scroll back up to the top of the page
            wd.find_element(by=By.TAG_NAME, value='body').send_keys(Keys.CONTROL + Keys.HOME)

    with concurrent.futures.ThreadPoolExecutor(10) as executor:
        executor.map(ScrapingTarget, urlList)

    data = {'prod-name': nameArray,
            'Price': priceArray,
            'currentZipCode': zipCodeArray,
            'Tcin': TCIN,
            'UPC': UPC,
            'GMT': GMTArray}
    df = pd.DataFrame.from_dict(data, orient='index')
    df = df.transpose()
    # note: header=True together with mode='a' re-writes the header row on every pass
    df.to_csv(r'C:\Users\12987\PycharmProjects\python\Network\priceingAlgoriCoding\export_Target_dataframe.csv',
              mode='a', index=False, header=True)
    sleep(1800)  # wait 30 minutes, then scrape everything again
```
Run results and error messages
The code scrapes the name, price, and other details of 200 products from the Target website, then keeps entering new zip codes and scraping the updated prices. The code runs without errors, but when I save the results to a CSV file, the product names, prices, and other fields do not match up (for example, the price of product 6 ends up next to the name of product 1):
| Name | Price | Time | Product ID |
| --- | --- | --- | --- |
| Product 1 | Price 2 | Time 1 | Product 3 ID |
| Product 3 | Price 1 | Time 3 | Product 2 ID |
| Product 2 | Price 3 | Time 2 | Product 1 ID |
My suspicion is that, because the script combines multithreading with WebDriver, pages load at different speeds, so whichever page finishes loading first gets scraped first; the product fields are therefore appended to the shared arrays out of order, and index i of one array no longer corresponds to index i of another.
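To illustrate what I mean, here is a toy sketch (hypothetical worker and data, not my real scraper) where several threads append to two shared lists, with a random delay standing in for uneven page-load times; the pairing between the two lists breaks in the same way:

```python
import concurrent.futures
import random
import time

names, prices = [], []

def worker(i):
    names.append(f"product-{i}")
    time.sleep(random.random() / 100)   # simulate a slow page load
    prices.append(f"price-{i}")         # may interleave with other threads

with concurrent.futures.ThreadPoolExecutor(10) as executor:
    executor.map(worker, range(6))

# zip() pairs by index, so names[i] and prices[i] can come from
# different products once the appends interleave.
print(list(zip(names, prices)))
```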
Desired result
The scraped names, prices, and other fields should be correctly paired in the CSV file:
| Name | Price | Time | Product ID |
| --- | --- | --- | --- |
| Product 1 | Price 1 | Time 1 | Product 1 ID |
| Product 2 | Price 2 | Time 2 | Product 2 ID |
| Product 3 | Price 3 | Time 3 | Product 3 ID |
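For reference, the shape I am hoping for is something like the sketch below: each worker returns one complete row per URL (a hypothetical `scrape_one`, not my real code), so every field of a product stays together, and `Executor.map` yields results in the order of its inputs. I am not sure how best to combine this with my zip-code loop, which is why I am asking.

```python
import concurrent.futures
import pandas as pd

urlList = ["https://example.com/a", "https://example.com/b"]  # placeholder URLs

def scrape_one(url):
    # ... scrape name, price, zipcode, tcin, upc, gmt for this one url ...
    # return all fields of one product together, as a single dict
    return {"prod-name": "...", "Price": "...", "currentZipCode": "...",
            "Tcin": "...", "UPC": "...", "GMT": "..."}

with concurrent.futures.ThreadPoolExecutor(10) as executor:
    rows = list(executor.map(scrape_one, urlList))  # results keep input order

df = pd.DataFrame(rows)  # one row per product, fields already paired
df.to_csv("export_Target_dataframe.csv", mode="a", index=False, header=True)
```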