网站权重与排名浅谈,中国比较有名的产品设计公司,徐州建设局网新网站,零基础学做网站的书

由于网页结构跟之前有变化，还不是很熟悉。代码待完善，问题记录：腾讯新闻二级网页内容爬取有问题。链家网站头文件没有用到。爬取一条腾讯视频的header内容，存入txt。要求：包含网页链接包含title包含所有headers信息imp…

由于网页结构跟之前有变化，还不是很熟悉。代码待完善，问题记录：

- 腾讯新闻二级网页内容爬取有问题。
- 链家网站头文件没有用到。

爬取一条腾讯视频的header内容，存入txt。要求：

- 包含网页链接
- 包含title
- 包含所有headers信息

```python
import requests
from bs4 import BeautifulSoup

u = 'http://news.qq.com/a/20170205/023923.htm'
r = requests.get(url = u)
headers = r.headers #获取网页头部信息
#print(headers)
soup = BeautifulSoup(r.text,'lxml')
title = soup.title.text #获取网页title内容
#print(title)
f = open('C:\\Users\\Administrator\\Desktop\\lianxi\\header.txt','w',encoding='utf8')
f.seek(0)
# 创建一个txt文件
f.write('爬取网页'+str(u)+'\n')
f.write('新闻标题为'+title+'\n')
for i in headers:
    lst = [i,':',headers[i],'\n']
    f.writelines(lst)
f.close()
print('finished!')
```

爬取腾讯新闻网站上某一天的某类新闻标题。要求：

- 开头‘XX年XX月XX日腾讯新闻’
- 包括新闻标题和网址
- (爬取每一条新闻的内容(二级标题))

```python
import requests
from bs4 import BeautifulSoup
import re

u = 'http://news.qq.com/world_index.shtml'
r = requests.get(url = u)
soup = BeautifulSoup(r.text,'lxml')
f = open('C:\\Users\\Administrator\\Desktop\\lianxi\\news.txt','w',encoding='utf8')
f.seek(0)
# 创建一个txt文件
f.write('2018年8月26日腾讯新闻\n')
news = soup.find_all('a',href=re.compile('http://news.qq.com/a/20180825/'))
#print(news)
for i in news:
    #print(i)
    txt = i.text.strip()
    #strip() 用于去掉前后空格
    if txt == '':
        continue
    else:
        lst = [txt,',','url=',i.attrs['href']]
        f.writelines(lst)
f.close()
print('finished!')
```

修改

```python
import requests
from bs4 import BeautifulSoup
import re

u = 'http://news.qq.com/world_index.shtml'
r = requests.get(url = u)
soup = BeautifulSoup(r.text,'lxml')
f = open('C:\\Users\\Administrator\\Desktop\\lianxi\\news.txt','w',encoding='utf8')
f.seek(0)
# 创建一个txt文件
f.write('2018年8月26日腾讯新闻\n')
news = soup.find_all('a',href=re.compile('//new.qq.com/omn/20180826'))
#print(news)
for i in news:
    #print(i)
    txt = i.text.strip()
    #strip() 用于去掉前后空格
    if txt == '':
        continue
    else:
        lst = [txt,',','url=','http:',i.attrs['href'],'\n']
        f.writelines(lst)
f.close()
print('finished!')
```

添加正文内容

```python
import requests
from bs4 import BeautifulSoup
import re

u = 'http://news.qq.com/world_index.shtml'
r = requests.get(url = u)
soup = BeautifulSoup(r.text,'lxml')
f = open('C:\\Users\\Administrator\\Desktop\\lianxi\\news2.txt','w',encoding='utf8')
f.seek(0)
# 创建一个txt文件
f.write('2018年8月26日腾讯新闻\n')
news = soup.find_all('a',href=re.compile('http://news.qq.com/a/2018'))
#print(news)
for i in news:
    #print(i)
    txt = i.text.strip()
    #strip() 用于去掉前后空格
    if txt == '':
        continue
    else:
        ul = i.attrs['href']
        ur = requests.get(url = ul)
        usoup = BeautifulSoup(ur.text,'lxml')
        f.write(txt+'\n')
        #打印正文
        f.write('正文如下\n')
        if usoup.body.attrs['id'] == 'P-QQ':
            #排除图片新闻
            continue
        else:
            p = usoup.find('div',id='Cnt-Main-Article-QQ').find_all('p')
            for i in p:
                print(i.text)
                f.write(i.text+'\n')
            f.write('\n')
f.close()
print('finished!')
```

爬虫正确的习惯和逻辑

- 函数式爬取
- 用浏览器去访问headers信息

```python
r = request.get(url='...',headers={...})
```

以浏览器的形式向网页进行请求

头部信息

```python
headers = input('粘贴头部信息')
lst = headers.split('\n')
m=[]
for i in lst:
    key = i.split(':')[0]
    value = i.split(':')[1]
    m.append([str(key),str(value)])
print(dict(m))
```

```python
def header_format(h):
    '''
    函数用于转译网页headers信息
    h：输入的headers信息
    '''
    h = input('粘贴头部信息')
    lst = h.split('\n')
    m=[]
    for i in lst:
        key = i.split(':')[0]
        value = i.split(':')[1]
        m.append([str(key),str(value)])
    return(dict(m))

print(header_format(headers))
```

用函数式写法的优点：

- 阅读性更强
- 函数的可复制性
- 便于修改

爬取一条腾讯视频的header内容，存入txt。函数式编写：

- 包含网页链接
- 包含title
- 包含所有headers信息

爬取链家二手房数据-深圳

```python
import requests
from bs4 import BeautifulSoup
import re

def url_analysis(u, h, s, n):
    '''
    用于分析网页，最后得到一个含有二级网址的标签列表
    u：起始网址
    h：头部信息
    s：二级网址包含特定字段
    n：页码
    '''
    url_lst=[]
    for i in range(1,n+1):
        if i == 1:
            r = requests.get(url=u+'nb1rs深圳/')
        else:
            r = requests.get(url=u+'pg'+str(i)+'nb1rs深圳/')
        soup = BeautifulSoup(r.text,'lxml')
        r2 = soup.find_all('a',href=re.compile(s))
        for j in r2:
            r3 = j.attrs['href']
            url_lst.append(r3)
    return(url_lst)

def content(u,h):
    '''
    爬取网页标签信息
    u：爬取的二级网址
    h：头部信息
    '''
    r = requests.get(url=u)
    r.encodinge = 'utf-8'
    soup = BeautifulSoup(r.text,'lxml')
    t = soup.title.text #爬取标题
    toprice = soup.find('div',class_='price').find('span',class_='total').text
    unprice = soup.find('div',class_='unitPrice').find('span',class_='unitPriceValue').text
    area = soup.find('div',class_='area').find('div',class_='mainInfo').text
    base = soup.find('div',class_='base').find('div',class_='content').find_all('li')
    year = base[-1].text
    pattern = 'resblockPosition:\'(.*?)\',' #.*?任意字符
    position = re.search(pattern,r.text).group(1)
    lng = position.split(',')[0]
    lat = position.split(',')[1]
    return([t,',', toprice,',', unprice,',', area,',', year,',',lng,',',lat,'\n'])

if __name__ == '__main__': #main函数
    web_u = 'https://sz.lianjia.com/ershoufang/'
    web_h = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, br',
             'Accept-Language': 'zh-CN,zh;q=0.9',
             'Connection': 'keep-alive',
             'Cookie': 'TY_SESSION_ID93f5b43a-5dc9-4d96-b57a-a4eb78f8dc15; lianjia_uuid614ed9e0-dc25-421f-ba8b-141c574dbb47; _smt_uid5b80defd.8430805; UM_distinctid1656f670d3e4ff-02814a7ed21053-b34356b-1fa400-1656f670d3fdd7; _jzqx1.1535172349.1535172349.1.jzqsrbj%2Elianjia%2Ecom|jzqct/.-; _gaGA1.2.50227061.1535172352; ljrefpc_sem_baidu_ppzq_x; lianjia_ssiddbe87b29-353a-45c2-97cf-aae666e2771b; Hm_lvt_9152f8221cb6243a53c83b956842be8a1535172349,1535201139,1535358484; _jzqa1.3976151446564617700.1535172349.1535201139.1535358484.3; _jzqc1; _jzqy1.1535201139.1535358484.1.jzqsrbaidu|jzqct%E9%93%BE%E5%AE%B6%E7%BD%91.-; _jzqckmp1; _gidGA1.2.1182771159.1535358486; select_city440300; all-ljc32edd623b8a5a59c7de54c92107bb6c; _qzjc1; CNZZDATA1255849469275538323-1535355329-%7C1535355329; CNZZDATA12545259481806440598-1535354494-%7C1535354494; CNZZDATA125563328472361912-1535358081-%7C1535358081; CNZZDATA12556040821229464985-1535356409-%7C1535356409; Hm_lpvt_9152f8221cb6243a53c83b956842be8a1535359605; _qzja1.1736056849.1535358739249.1535358739249.1535358739249.1535359600160.1535359605575.0.0.0.10.1; _qzjb1.1535358739249.10.0.0.0; _qzjto10.1.0; _jzqb1.15.10.1535358484.1',
             'Host': 'sz.lianjia.com',
             'Referer': 'https',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    web_s = 'https://sz.lianjia.com/ershoufang/105'
    web_n = 3

    f = open('C:\\Users\\Administrator\\Desktop\\lianxi\\lianjia.txt','w')
    f.seek(0)
    f.write('title,total_price万元,unprice元/平方米,area平方米,产权年限,lng,lat\n')
    for i in url_analysis(web_u, web_h, web_s, web_n):
        data = content(i,web_h)
        f.writelines(data)
        print(data)
    f.close()
    print('finished!')
```