[Web擷取功能] CAPTCHA 破解基礎方法
##install##
#install tesseract-ocr
#python -m pip install Pillow
#python -m pip install pytesseract
#python -m pip install selenium
##import##
#from PIL import Image
#from pytesseract import image_to_string
#from selenium import webdriver
##create browser driver and save screenshot##
#browser = webdriver.Ie(driver_path)
#browser = webdriver.Chrome(driver_path)
#browser = webdriver.Firefox(driver_path)
#browser = webdriver.Ie(driver_path)
#browser = webdriver.Chrome(driver_path)
#browser = webdriver.Firefox(driver_path)
#browser.get(url)
#browser.set_window_size(800, 600) # option
# Save a screenshot of the loaded page; the CAPTCHA is cropped out of this
# file later. The WebDriver instance in this snippet is named `browser`
# (the original line referenced an undefined `driver`).
browser.save_screenshot(screenshot_filepath)
##get CAPTCHA image element##
#element = browser.find_element_by_xpath('//*[@id="form1"]/img')
# Position and dimensions of the CAPTCHA element; get_captcha_text reads
# the 'x'/'y' keys of `location` and the 'width'/'height' keys of `size`.
location, size = element.location, element.size
##functions##
def get_captcha_text(location, size):
    """Crop the CAPTCHA region out of the saved screenshot and OCR it.

    Reads the module-level ``screenshot_filepath`` (the image to open) and
    ``pytesseract_path`` (assigned to pytesseract's ``tesseract_cmd``).

    Args:
        location: Mapping holding the region's top-left corner under the
            keys ``'x'`` and ``'y'``.
        size: Mapping holding the region's dimensions under the keys
            ``'width'`` and ``'height'``.

    Returns:
        The result of running ``image_to_string`` on the cropped region.
    """
    # The import shown for this snippet is
    # `from pytesseract import image_to_string`, which does not bind the
    # name `pytesseract`; import the package here so the `tesseract_cmd`
    # assignment below does not raise NameError.
    import pytesseract

    pytesseract.pytesseract.tesseract_cmd = pytesseract_path

    left = location['x']
    top = location['y']
    right = left + size['width']
    bottom = top + size['height']

    # Run the OCR while the file is still open, then let the context
    # manager close the screenshot instead of leaking the file handle.
    with Image.open(screenshot_filepath) as img:
        return pytesseract.image_to_string(img.crop((left, top, right, bottom)))
##reference##
https://github.com/VineetChaurasiya/scraping_scripts/blob/master/login_captcha_bypass.py
留言
張貼留言