* 로그인을 하고 크롤링을 하는 이유

  1. 구글의 경우 로그인하고 나오는 이미지와 로그인을 하지 않고 나오는 이미지 목록이 다를 때가 많다.
  2. 성인인증이 필요한 이미지들은 로그인을 해야만 가져올 수 있다.

 

* 사용법

  1. 정상적으로 크롤링되는지 확인완료 [23.06.20]
  2. 모듈 설치 - pip install undetected_chromedriver selenium
  3. 주석 1번에 이미지를 원하는 검색어 목록 입력
  4. 주석 2번에 폴더이름 입력. 이미지는 data\폴더이름_검색어\ (예: data\google_검색어\) 아래 저장됨
  5. 주석 3번에 상세이미지의 xPath 입력. 구글의 경우 자주 바뀌는 것 같다.
  6. 주석 4번에 구글 ID 입력
  7. 주석 5번에 구글 비밀번호 입력. 이후 추가로 스마트폰 인증화면이 뜰 경우에 대비해 20초간 기다린다.
'''
* Fetch Google images (23.06.20)
'''

import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

import urllib
import time, datetime

ITEM_LIST = [ "Keith Thompson", "Zdzislaw Beksinski", "dariusz zawadzki"] # No. 1: search terms; one crawl is run per entry
FOLDER = 'google' # No. 2: prefix of the per-term download folder created under ./data
IMG_XPATH = '//*[@id="Sva75c"]/div[2]/div/div[2]/div[2]/div[2]/c-wiz/div/div/div/div[3]/div[1]/a/img[1]' # No. 3: XPath of the enlarged detail image (reported to change often on Google's side)
# Google sign-in page that the browser opens before crawling.
SIGNINURL = 'https://accounts.google.com/signin/v2/identifier?hl=ko&passive=true&continue=https%3A%2F%2Fwww.google.com%2F&ec=GAZAmgQ&flowName=GlifWebSignIn&flowEntry=ServiceLogin'
ID = 'xxxx@gmail.com' # No. 4: Google account ID
PASSWORD = 'xxxx' # No. 5: Google account password (NOTE(review): hard-coded credential; keep real values out of version control)

def main():
  """Sign in to Google, then crawl and save images for every term in ITEM_LIST.

  For each search term this creates the download folder, opens the image
  search results, scrolls until no more thumbnails load, and saves the
  detail images. A one-line summary (failure count, elapsed time, wall
  clock) is printed per term.

  The browser is always closed, even when a step raises, so a failed run
  does not leave a stray Chrome process behind.
  """
  start = check_start()  # start the stopwatch
  driver = uc.Chrome()  # undetected-chromedriver browser used for the Google login
  try:
    driver.get(SIGNINURL)
    googleSignIn(driver)

    for searchItem in ITEM_LIST:
      saveDir = makeFolder(searchItem)

      driver.get(makeUrl(searchItem))  # open the image-search results for this term
      maximizeWindow(driver)
      scrollToEnd(driver)

      # Click through every thumbnail and download its detail image.
      forbiddenCount = saveImgs(driver, saveDir, start)
      sec = check_time(start)
      print(f'์‹คํŒจ์ˆ˜{forbiddenCount}, {sec}, {datetime.datetime.now().time()}')
    time.sleep(10)
  finally:
    driver.quit()
  
# Sign in to Google
def googleSignIn(driver):
  """Submit the Google account ID and password on the sign-in page.

  Expects ``driver`` to already show SIGNINURL. Types ID, presses "next",
  waits up to 10 seconds for the password field to become interactable,
  types PASSWORD and presses "next" again. Selenium errors raised during
  the password step are printed rather than propagated, and the function
  then pauses 20 seconds so a phone verification prompt can be completed
  by hand.

  Args:
    driver: Selenium WebDriver positioned on the Google sign-in page.
  """
  # Imported here so the block does not depend on a new file-level import.
  from selenium.common.exceptions import WebDriverException

  idBtn = driver.find_element(By.XPATH, '//*[@id="identifierId"]')  # account ID field
  idBtn.send_keys(ID)
  nextBtn = driver.find_element(By.XPATH, '//*[@id="identifierNext"]/div/button')
  nextBtn.click()

  try:
    # The password input is present in the DOM before it can accept keys;
    # waiting for "clickable" (visible and enabled) avoids the
    # element-not-interactable error that a presence-only wait runs into.
    passwordBtn = WebDriverWait(driver, timeout=10).until(
      EC.element_to_be_clickable((By.XPATH, '//*[@id="password"]/div[1]/div/div[1]/input'))
    )
    passwordBtn.send_keys(PASSWORD)
    passwordNextBtn = driver.find_element(By.XPATH, '//*[@id="passwordNext"]/div/button')
    passwordNextBtn.click()
    print('๊ตฌ๊ธ€ ๋กœ๊ทธ์ธ ์„ฑ๊ณต')
  except (OSError, WebDriverException) as e:
    # Selenium failures (timeout, missing element, ...) are WebDriverException,
    # not OSError, so both are caught to keep the best-effort behaviour.
    print(e)

  time.sleep(20)  # leave enough time for phone / identity verification


# Build the Google image-search URL
def makeUrl(searchItem):
  """Return the Google image-search URL for ``searchItem``.

  Args:
    searchItem: Search phrase; it is URL-encoded into the ``q`` parameter.

  Returns:
    The full ``https://www.google.com/search?...`` URL as a string.
  """
  # A bare ``import urllib`` does not load the ``parse`` submodule, so import
  # the function explicitly instead of relying on another module having done it.
  from urllib.parse import urlencode

  params = {  # both q and tbm are required
    'q': searchItem,
    'tbm': 'isch',
  }
  return 'https://www.google.com/search?' + urlencode(params)


# ํด๋” ์ƒ์„ฑ
def makeFolder(searchItem):
  """Create (if needed) and return the download folder for one search term.

  The folder is ``<cwd>/data/<FOLDER>_<searchItem>``.

  Args:
    searchItem: Search phrase used as the folder-name suffix.

  Returns:
    The absolute path of the folder.

  Raises:
    OSError: If the folder cannot be created. The error is printed first
      and then re-raised, so callers never receive ``None`` as a path.
  """
  saveDir = os.path.join(os.getcwd(), 'data', f'{FOLDER}_{searchItem}')
  try:
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(saveDir, exist_ok=True)
  except OSError as e:
    # Format the exception; ``e + '...'`` would raise TypeError.
    print(f'{e}ํด๋” ์ƒ์„ฑ ์‹คํŒจ')
    raise
  return saveDir

# Maximize the browser window
def maximizeWindow(driver):
  """Ask ``driver`` to maximize its browser window.

  Args:
    driver: Object exposing ``maximize_window()`` (a Selenium WebDriver).
  """
  driver.maximize_window()

# Scroll down repeatedly so that the full list of images gets loaded
def scrollToEnd(driver, pre_scroll_delay=1, post_scroll_delay=3):
  """Scroll to the bottom of the page until its height stops growing.

  After each scroll the document height is read again; once two
  consecutive readings are equal the page is treated as fully loaded and
  the view is returned to the top.

  Args:
    driver: Selenium WebDriver showing the page to scroll.
    pre_scroll_delay: Seconds to wait before each scroll (default 1).
    post_scroll_delay: Seconds to wait after each scroll so newly
      requested content can load before the height is read (default 3).
  """
  prev_height = driver.execute_script('return document.body.scrollHeight')
  print(f'prev_height: {prev_height}')

  while True:
    # Naver gets stuck in endless loading when scrolled without a pause.
    time.sleep(pre_scroll_delay)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(post_scroll_delay)

    cur_height = driver.execute_script('return document.body.scrollHeight')
    print(f'cur_height: {cur_height}')
    if cur_height == prev_height:
      print('๋†’์ด๊ฐ€ ๊ฐ™์•„์ง')
      break
    prev_height = cur_height
  # Once everything is loaded, go back to the top of the page.
  driver.execute_script('window.scrollTo(0, 0)')

# Save every image
def saveImgs(driver, saveDir, start):
  """Click each thumbnail and download its detail image into ``saveDir``.

  Images are saved as ``<index>.jpg`` where the index is the thumbnail's
  0-based position. A failure on one image (click, lookup, download or
  file error) is printed and counted, and the loop moves on to the next.

  Args:
    driver: Selenium WebDriver showing the image-search results.
    saveDir: Existing folder that receives the downloaded files.
    start: Start timestamp passed to ``check_time`` for progress output.

  Returns:
    The number of images that could not be saved.
  """
  # A bare ``import urllib`` does not load the ``request`` submodule, so import
  # the function explicitly instead of relying on another module having done it.
  from urllib.request import urlretrieve

  time.sleep(1)
  forbiddenCount = 0
  imgs = driver.find_elements(By.CSS_SELECTOR, '.rg_i.Q4LuWd')
  img_count = len(imgs)
  print(f'์ „์ฒด ์ด๋ฏธ์ง€์ˆ˜ : {img_count}')
  # Click the thumbnails one by one and save each detail image.
  for imgNum, img in enumerate(imgs):  # imgNum is the 0-based image number
    try:
      img.click()
      time.sleep(3)

      # IMG_XPATH seems to change often; the other selectors look stable,
      # so this is the one to re-check when downloads start failing.
      bigImg = driver.find_element(By.XPATH, IMG_XPATH)
      src = bigImg.get_attribute('src')
      if not src:
        raise ValueError(f'image {imgNum} has no src attribute')
      urlretrieve(src, os.path.join(saveDir, f'{imgNum}.jpg'))
      sec = check_time(start)
      print(f'{imgNum+1}/{img_count} saved {sec}')

    except Exception as e:
      # Deliberately best-effort: forbidden responses and file errors are common.
      print(e)
      forbiddenCount += 1
  return forbiddenCount


# Elapsed-time helpers
def check_start():
    """Print and return the current time as the run's start timestamp.

    Returns:
        Seconds since the epoch, as given by ``time.time()``.
    """
    began_at = time.time()
    print(f"Start! now..{began_at}")
    return began_at
def check_time(start):
    """Return the time elapsed since ``start`` as text without fractions.

    Args:
        start: Start timestamp in seconds since the epoch.

    Returns:
        ``str(timedelta)`` of the elapsed time, cut at the decimal point
        (for example ``'0:01:05'``).
    """
    elapsed = datetime.timedelta(seconds=time.time() - start)
    whole_part, _, _ = str(elapsed).partition('.')
    return whole_part
# Script entry point: starts the crawl as soon as this line executes.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing this module also runs the crawler.
main()

* ์ƒ์„ธ ์ด๋ฏธ์ง€์˜ xPath ์•Œ์•„๋‚ด๋Š” ๋ฐฉ๋ฒ•

- ํฌ๋กฌ์˜ ์ด๋ฏธ์ง€ ํด๋ฆญ ํ›„ ๋œจ๋Š” ์ƒ์„ธ์ด๋ฏธ์ง€ ํ™”๋ฉด์—์„œ ํ•ด๋‹น elements์˜ xPath๋ฅผ ์•„๋ž˜์™€ ๊ฐ™์ด ๋ณต์‚ฌํ•จ

 

๋‹ค์Œ์—๋Š” headless ํฌ๋กค๋ง์— ๋Œ€ํ•ด ์ •๋ฆฌํ•˜๊ฒ ๋‹ค

- headless ํฌ๋กค๋ง์€ ํ™”๋ฉด์— ๋ธŒ๋ผ์šฐ์ € ์ฐฝ์„ ๋„์šฐ์ง€ ์•Š๊ณ  ๋ฉ”๋ชจ๋ฆฌ์—์„œ๋งŒ ์ž‘๋™ํ•˜๋Š” ๋ฐฉ์‹์ด๋‹ค.

+ Recent posts