柠檬叔的博客

# importing the modules
from bs4 import BeautifulSoup
import os,sys
import requests
from app import app
from flask_appbuilder.models.sqla.interface import SQLAInterface
from app import appbuilder, db
from app.models import Notes
import logging
log = logging.getLogger(__name__)



def sav_to_db(title,f_content):
	try:
		db.session.add(Notes(title=title, content=f_content,created_by_fk=1,changed_by_fk=1))
		db.session.commit()
	except Exception as e:
		log.error("Notes creation error: %s", e)
		db.session.rollback()
		exit(1)


def scraping(url):
	debug = {'verbose': sys.stderr}
	headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'}
	page = requests.get(url,headers=headers)
	page.encoding = 'utf-8'
	soup = BeautifulSoup(page.text, 'html.parser')
	article = soup.find("div", class_="note-container")
	title = article.find("h1")
	notes = article.find_all("div", class_="note")
	print(title)
	print(notes[1])
	#<h1>榨菜肉丝面</h1>
	#<div class="note">
	#<div class="image-container image-float-center">
	#<div class="image-wrapper">
	#<img height="auto" src="https://cfa88nrmbjjfatjz.public.blob.vercel-storage.com/legacy/uploads/p89110217-476b0b9889b5.jpg"/>
	#</div>
	#</div>
	#<div class="image-container image-float-center">
	#<div class="image-wrapper">
	#<img height="auto" src="https://cfa88nrmbjjfatjz.public.blob.vercel-storage.com/legacy/uploads/p89110227-0b1314385c69.jpg"/>
	#</div>
	#</div>
	#<div class="image-container image-float-center">
	#<div class="image-wrapper">
	#<img height="auto" src="https://cfa88nrmbjjfatjz.public.blob.vercel-storage.com/legacy/uploads/p89110216-52a40fae29df.jpg"/>
	#</div>
	#</div>
	#<div class="image-container image-float-center">
	#<div class="image-wrapper">
	#<img height="auto" src="https://cfa88nrmbjjfatjz.public.blob.vercel-storage.com/legacy/uploads/p89110224-1a22ddd94099.jpg"/>
	#</div>
	#</div>
	#<div class="image-container image-float-center">
	#<div class="image-wrapper">
	#<img height="auto" src="https://cfa88nrmbjjfatjz.public.blob.vercel-storage.com/legacy/uploads/p89110232-06e77f7991fe.jpg"/>
	#</div>
	#</div>
	#<div class="image-container image-float-center">
	#<div class="image-wrapper">
	#<img height="auto" src="https://cfa88nrmbjjfatjz.public.blob.vercel-storage.com/legacy/uploads/p89110223-dab6e86cbcef.jpg"/>
	#</div>
	#</div>
	#<div class="image-container image-float-center">
	#<div class="image-wrapper">
	#<img height="auto" src="https://cfa88nrmbjjfatjz.public.blob.vercel-storage.com/legacy/uploads/p89110230-476b0b9889b5.jpg"/>
	#</div>
	#</div>
	#<p data-align="left">不错，原来汤面的核心是做汤底</p>
	#</div>
	#怎么处理这个content是我的烦恼了看来是
	#第一步：
	#replace掉这个多余的东西
	#<div class="image-container image-float-center">
	#
	#第二步：
	#replace掉所有的关闭
	#/></div></div>
	#to
	#</p>
	#
	#第三步：
	#replace掉这个东西
	#<div class="note">
	#
	#第四步：
	#replace
	#<div class="image-wrapper">
	#to 
	#<p>
	note = str(notes[1])
	step1 = note.replace('<div class="image-container image-float-center">',"")
	#print("STEP1 reasult:========================")
	#print(step1)
	step2 = step1.replace('/></div></div>',"</p>")
	#print("STEP2 reasult:========================")
	#print(step2)
	step3 = step2.replace('<div class="note">',"")
	#print("STEP3 reasult:========================")
	#print(step3)
	step4 = step3.replace('<div class="image-wrapper">',"<p>")
	#print("STEP4 reasult:========================")
	#print(step4)
	step5 = step4.replace('</div>',"")
	#print("STEP5 reasult:========================")
	#print(step5)
	step6 = step5.replace('https://img1.doubanio.com/view/note/l/public/',"/static/uploads/")
	step7 = step6.replace('https://img2.doubanio.com/view/note/l/public/',"/static/uploads/")
	step8 = step7.replace('https://img3.doubanio.com/view/note/l/public/',"/static/uploads/")
	step9 = step8.replace('https://img4.doubanio.com/view/note/l/public/',"/static/uploads/")
	step10 = step9.replace('https://img5.doubanio.com/view/note/l/public/',"/static/uploads/")
	step11 = step10.replace('https://img6.doubanio.com/view/note/l/public/',"/static/uploads/")
	step12 = step11.replace('https://img7.doubanio.com/view/note/l/public/',"/static/uploads/")
	step13 = step12.replace('https://img8.doubanio.com/view/note/l/public/',"/static/uploads/")
	step14 = step13.replace('https://img9.doubanio.com/view/note/l/public/',"/static/uploads/")
	step15 = step14.replace('https://img10.doubanio.com/view/note/l/public/',"/static/uploads/")
	step16 = step15.replace('https://img11.doubanio.com/view/note/l/public/',"/static/uploads/")
	print("STEP16 reasult:========================")
	print(step16)
	#OK,第四步其实就已经是我想要的结果了
	#接下来该处理所有的图片
	imgs = notes[1].find_all("img")
	print("images reasult:========================")
	for img in imgs:
		img_src = img["src"]
		root = './app/static/uploads/'
		path = root + img_src.split('/')[-1]
		print("remote images src:========================")
		print(img_src)
		print("local images src:========================")
		print(path)
		#这里本来还应该有个try的，但是为了调试方便，不放了，没有考虑图片重名的问题
		if not os.path.exists(root):
			print("root not exists")
			os.mkdir(root)
		if not os.path.exists(path):
			r=requests.get(img_src,headers=headers)
			with open(path,"wb") as f:
				f.write(r.content)
				f.close
				print("file save succ")
		else:
			print("file exists")
	#最终需要存储的标题
	title = title.getText()
	#最终需要存储的内容
	f_content = step16
	print(title)
	print(f_content)
	sav_to_db(title,f_content)


#=========main=============#
s_url = sys.argv[1]
scraping(s_url)
迁移豆瓣日志20220524