




# Path of the Tesseract executable that pytesseract launches.
# The stock setting relies on `tesseract` being on PATH:
# tesseract_cmd = 'tesseract'
# Point it at the Windows install location instead.
tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract.exe'


from PIL import Image
import pytesseract

# Run OCR on the test picture using the Simplified Chinese language data.
question = pytesseract.image_to_string(Image.open('test.jpeg'), lang='chi_sim')
question = question.replace(' ', '')  # remove spaces
print(question)



from PIL import Image

# NOTE(review): `img` is not defined in this snippet; it is presumably a PIL
# image opened earlier with Image.open(...) - confirm against the caller.
# The mode must be passed as the string '1' (Pillow's 1-bit black/white mode).
img = img.convert('1')


from PIL import Image

p = Image.open(picname)
p_size = p.size  # image size as (width, height)
# Cut out the question area: the band between 25% and 45% of the height.
# The box is (left, upper, right, lower); the last two numbers must be larger
# than the first two.
t = p.crop((0, int(p_size[1]) * 0.25, p_size[0], int(p_size[1]) * 0.45))

但是修改之後,識別率並沒有明顯的變化,大多數圖片識別出來的還是亂碼。在停滯了一段時間之後(主要還是因為學期末有很多事要做- -),我突然想到可以修改圖片的背景顏色和字體的顏色。經過多次檢驗,發現黃底黑字的識別率最高;顏色改了之後,大多數的題目都能識別出來了。


from PIL import Image

# NOTE(review): `t1` is not defined in this snippet; it is presumably the
# cropped question image - confirm against the caller.
t2 = t1.convert('RGB')  # work in RGB mode
for i in range(t2.size[0]):
    for j in range(t2.size[1]):
        r, g, b = t2.getpixel((i, j))
        # The source conditions were written as `(r,g<100)` and
        # `(r,g,b>=180)`. Those are tuple expressions and therefore always
        # truthy, so the code effectively ran the two-way split below.
        # That effective behaviour is kept and made explicit.
        if b > r and b > g and b < 210:
            pixel = (255, 255, 154)  # blue background becomes yellow
        else:
            pixel = (0, 0, 0)  # everything else (the white text) becomes black
        t2.putpixel((i, j), pixel)



# Screenshot-based answer helper: grab the phone screen over adb, OCR the
# question and the four choices, search Baidu Zhidao for the question and
# print how often each choice appears in the returned answers.
from PIL import Image
import pytesseract
import requests
from bs4 import BeautifulSoup as BS
from urllib import parse
import datetime
import os

_YELLOW = (255, 255, 154)
_BLACK = (0, 0, 0)

# Result offsets requested from the search (`pn` query parameter).
_RESULT_OFFSETS = (0, 10, 20)
# Fixed denominator used for the percentage.
# NOTE(review): presumably 3 pages x 10 results per page - confirm.
_EXPECTED_ANSWERS = 30

# Choice boxes inside the cropped choice panel (left, upper, right, lower).
# The first lower edge is 691 / 6 rounded, as Pillow rounds crop boxes.
_CHOICE_BOXES = (
    (0, 0, 600, 115),
    (0, 160, 600, 300),
    (0, 360, 600, 500),
    (0, 550, 600, 691),
)


def _two_tone(img, blue_limit, bluish_color, other_color):
    """Recolor *img* in place to two colors to make OCR easier.

    A pixel whose blue channel dominates red and green and is below
    *blue_limit* becomes *bluish_color*; every other pixel becomes
    *other_color*.

    The source wrote extra conditions as `(r,g<100)` and `(r,g,b>=N)`.
    Those are tuple expressions and always truthy, so this two-way split is
    the behaviour the code effectively had; it is kept and made explicit.
    """
    width, height = img.size
    for x in range(width):
        for y in range(height):
            r, g, b = img.getpixel((x, y))[:3]
            if b > r and b > g and b < blue_limit:
                img.putpixel((x, y), bluish_color)
            else:
                img.putpixel((x, y), other_color)


def _find_text_origin(img, size):
    """Return the (left, upper) corner of the question text box.

    Scans column by column for the first column whose top pixel has a blue
    value below 190 and that contains a pixel with a blue value above 200,
    then backs off 50 px horizontally and 150 px vertically. Returns (0, 0)
    when no such pixel exists.
    """
    width, height = size
    for x in range(width):
        if img.getpixel((x, 0))[2] >= 190:
            continue
        for y in range(height):
            if img.getpixel((x, y))[2] > 200:
                # Return at the first hit. The source used `new_x` itself as
                # the "found" flag, which failed when x - 50 was exactly 0.
                return x - 50, y - 150
    return 0, 0


def _fetch_answers(question):
    """Return the answer texts Baidu Zhidao lists for *question*."""
    # Characters that GBK cannot represent (OCR noise) are dropped rather
    # than aborting the lookup.
    query = parse.quote(question.encode('gbk', errors='ignore'))
    answers = []
    for offset in _RESULT_OFFSETS:
        url = ('https://zhidao.baidu.com/search?word=' + query
               + '&ie=gbk&site=-1&sites=0&date=0&pn=' + str(offset))
        response = requests.get(url, timeout=10)
        response.encoding = 'gbk'  # the page is served in GBK
        soup = BS(response.text, 'html.parser')
        result_list = soup.find('div', id='wgt-list')
        if result_list is None:
            # No result container on this page.
            continue
        for entry in result_list.find_all('dl', class_='dl'):
            answer = entry.find('dd', class_='dd answer')
            if answer is not None:
                answers.append(answer.text)
    return answers


def open_pic(picname):
    """Open the screenshot and cut out the band that holds the question.

    The band (25%-45% of the height) is also saved to ./first_change.png.

    Returns:
        (band_size, full_image, band_image)
    """
    p = Image.open(picname)
    width, height = p.size
    # Box is (left, upper, right, lower); the last two must exceed the first
    # two. round() matches how Pillow itself rounds fractional boxes.
    t = p.crop((0, round(height * 0.25), width, round(height * 0.45)))
    t.save('./first_change.png')
    return t.size, p, t


def get_question(picsize, firstpic):
    """OCR the question text out of the band returned by open_pic().

    The text box is located, recolored to black on yellow (the combination
    the author found to OCR best), saved to ./second_change.png and passed
    to Tesseract. The recognised text is printed and returned without any
    whitespace.
    """
    left, upper = _find_text_origin(firstpic, picsize)
    text_box = firstpic.crop((left, upper, left + 894, upper + 280))
    text_box = text_box.convert('RGB')
    # Blue background becomes yellow, everything else (white text) black.
    _two_tone(text_box, 210, _YELLOW, _BLACK)
    text_box.save('./second_change.png')
    question = pytesseract.image_to_string(text_box, lang='chi_sim')
    question = ''.join(question.split())  # drop spaces and line breaks
    print(question)
    return question


def get_choice(oldpic):
    """OCR the four answer choices from the full screenshot.

    Each choice box is recolored to black on yellow and saved to
    ./ana_choice.png (overwritten per choice) before OCR. Every recognised
    choice is printed.

    Returns:
        List of the four choice strings, in screen order.
    """
    height = oldpic.size[1]
    panel = oldpic.crop((250, round(height * 11 / 20), 850, round(height * 8 / 9)))
    choices = []
    for box in _CHOICE_BOXES:
        choice_img = panel.crop(box).convert('RGB')
        # Blue text becomes black, everything else (white button) yellow.
        _two_tone(choice_img, 220, _BLACK, _YELLOW)
        choice_img.save('./ana_choice.png')
        choice = pytesseract.image_to_string(choice_img, lang='chi_sim')
        choice = ''.join(choice.split())
        # A capital letter O in a choice tends to be recognised as digit 0.
        choice = choice.replace('0', 'O')
        print(choice)
        choices.append(choice)
    return choices


def search_answer(question, choices):
    """Print, for choices A-D, how often each appears in the search answers."""
    answers = _fetch_answers(question)
    for label, choice in zip('ABCD', choices):
        # An empty OCR result would be "contained" in every answer; count it
        # as zero hits instead.
        hits = sum(1 for text in answers if choice and choice in text)
        # NOTE(review): the source had one more string literal before the
        # label whose content is not visible; it is treated as empty here.
        print(label + '的可能性是' + '%.2f' % (hits * 100 / _EXPECTED_ANSWERS) + '%')


def main(filename):
    """Run the whole pipeline on the screenshot stored at *filename*."""
    # Open and crop the screenshot once (the source repeated this three times).
    picsize, oldpic, firstpic = open_pic(filename)
    question = get_question(picsize, firstpic)
    choices = get_choice(oldpic)
    search_answer(question, choices)


if __name__ == '__main__':
    your = input('準備好了按y')
    if your == 'y':
        # Start timing after the prompt so the wait for the user is excluded.
        start = datetime.datetime.now()
        os.system('adb shell screencap -p /sdcard/auto.png')
        os.system('adb pull /sdcard/auto.png')
        with Image.open('auto.png') as img:
            # convert() returns a new image; the source discarded it and
            # saved the unconverted screenshot.
            rgb = img.convert('RGB')
        rgb.save('auto.png')
        main('auto.png')
        end = datetime.datetime.now()
        # NOTE(review): the trailing literal is not visible in the source;
        # '秒' (seconds) is assumed because `.seconds` is printed.
        print('本次一共花了' + str((end - start).seconds) + '秒')








if (oSession.host == "question.hortor.net") {
    // Decode the HTTP response first in case it is gzip-compressed.
    oSession.utilDecodeResponse();
    // Save the full response (including HTTP headers).
    // Backslashes in the Windows paths must be escaped inside the literal.
    oSession.SaveResponse("C:\\Users\\XXXX\\Desktop\\data\\response.txt", true);
    // Save just the response body.
    oSession.SaveResponseBody("C:\\Users\\XXXX\\Desktop\\data\\responsebody.txt");
}


# Packet-capture based answer helper: read the question JSON that Fiddler
# saved to disk, search Baidu Zhidao for the question and print how often
# each option appears in the returned answers. Polls every two seconds.
import json
import sys
import time
from urllib import parse
import requests
from bs4 import BeautifulSoup as BS

# Result offsets requested from the search (`pn` query parameter).
_RESULT_OFFSETS = (0, 10, 20)
# Fixed denominator used for the percentage.
# NOTE(review): presumably 3 pages x 10 results per page - confirm.
_EXPECTED_ANSWERS = 30


def get_appinf(filename):
    """Read the captured response body and extract the question.

    Prints a banner with the question number and text.

    Returns:
        (quiz, options) when the file holds a question, otherwise None
        (file missing or unreadable, not JSON, or no quiz/options in it).
    """
    try:
        # `with` closes the file on every path; the source returned before
        # reaching its close() call.
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)['data']
        # Only proceed when the data holds both a question and its options.
        if 'quiz' not in data or 'options' not in data:
            return None
        num = data['num']
        quiz = data['quiz']
        cho = data['options']
    except (OSError, ValueError, KeyError, TypeError):
        return None
    # NOTE(review): the literal before the number is not visible in the
    # source; '第' is assumed.
    print(('第' + str(num) + '題:' + quiz).center(50, '*') + '\n')
    return quiz, cho


def _fetch_answers(question):
    """Return the answer texts Baidu Zhidao lists for *question*."""
    # Certificate checks are disabled below; silence the resulting warnings
    # once instead of on every request.
    requests.packages.urllib3.disable_warnings()
    # Characters that GBK cannot represent are dropped rather than aborting.
    query = parse.quote(question.encode('gbk', errors='ignore'))
    answers = []
    for offset in _RESULT_OFFSETS:
        url = ('https://zhidao.baidu.com/search?word=' + query
               + '&ie=gbk&site=-1&sites=0&date=0&pn=' + str(offset))
        # verify=False: do not validate the certificate.
        # NOTE(review): presumably needed because the capturing proxy
        # intercepts HTTPS - confirm.
        response = requests.get(url, verify=False, timeout=10)
        response.encoding = 'gbk'  # the page is served in GBK
        soup = BS(response.text, 'html.parser')
        result_list = soup.find('div', id='wgt-list')
        if result_list is None:
            # No result container on this page.
            continue
        for entry in result_list.find_all('dl', class_='dl'):
            answer = entry.find('dd', class_='dd answer')
            if answer is not None:
                answers.append(answer.text)
    return answers


def search(question, choice):
    """Print the share of search answers containing each option.

    Options are labelled A-D in order. After the four percentages the label
    with the highest share is printed as the recommended answer (the first
    one wins a tie).
    """
    answers = _fetch_answers(question)
    # Count how often each option occurs in the scraped answers.
    results = {}
    for label, option in zip('ABCD', choice):
        # An empty option would be "contained" in every answer; count it as
        # zero hits instead.
        hits = sum(1 for text in answers if option and option in text)
        results[label] = hits / _EXPECTED_ANSWERS
    for label, share in results.items():
        # NOTE(review): the source had one more string literal before the
        # label whose content is not visible; it is treated as empty here.
        line = (label + '的可能性是:%.2f%%' % (share * 100)).center(50)
        # A blank line follows the last option.
        print(line + '\n' if label == 'D' else line)
    if not results:
        return
    # Pick the key with the largest value.
    bestchoice = max(results, key=results.get)
    print(('此題最好選' + bestchoice).center(50, '-') + '\n\n\n')


def main():
    """Process the captured question once; never raises.

    The polling loop must survive transient failures (file not written yet,
    network errors), so errors are reported on stderr instead of propagating.
    """
    # Change this to your own save location.
    info = get_appinf('C:/Users/XXXX/Desktop/data/responsebody.txt')
    if info is None:
        return
    que, cho = info
    try:
        search(que, cho)
    except Exception as exc:  # top-level boundary of the polling loop
        print('查詢失敗: %s' % exc, file=sys.stderr)


if __name__ == '__main__':
    while True:
        main()
        time.sleep(2)



當爬蟲不遵守 robots 協議時,有沒有防止抓取的可能?
python 中文url 編碼如何轉換回中文?

TAG:Python | 爬虫（计算机网络） | 头脑王者 |