1. What Is Ajax
Ajax (Asynchronous JavaScript and XML) is a technique for exchanging data with a server and updating parts of a page without reloading the whole page.
1.1 A motivating example
A familiar example is an infinite-scrolling feed such as Weibo: scrolling down loads more posts in place, with no full page refresh.
1.2 Basic principles
Loading data with Ajax takes three steps: sending the request, parsing the content, and rendering the page.
Sending the request
In classic JavaScript this is done with an XMLHttpRequest object:
var xmlhttp;
if (window.XMLHttpRequest) {
    xmlhttp = new XMLHttpRequest();
} else {
    // Fallback for IE6 and earlier
    xmlhttp = new ActiveXObject("Microsoft.XMLHTTP");
}
// Listen for the server's response
xmlhttp.onreadystatechange = function() {
    // Parse the response content
    if (xmlhttp.readyState == 4 && xmlhttp.status == 200) {
        document.getElementById("myDiv").innerHTML = xmlhttp.responseText;
    }
}
// Open the connection to the server
xmlhttp.open("POST", "/ajax/", true);
// Send the request to the server
xmlhttp.send();
Parsing the content
Parsing happens inside the onreadystatechange callback shown above: once readyState reaches 4 (request complete) and status is 200, the response body is available, here read via xmlhttp.responseText (it could equally be JSON or XML, depending on what the server returns).
Rendering the page
Rendering is the final line of the callback: assigning the response to a DOM element updates the page in place, without a full reload:

// Change the page content
document.getElementById("myDiv").innerHTML = xmlhttp.responseText;
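From a crawler's point of view, the same exchange can be reproduced without a browser: issue the request the JavaScript would send and consume the response directly. A minimal sketch in Python, assuming a hypothetical server exposing the /ajax/ endpoint used above:

import requests

# Hypothetical host; the path mirrors xmlhttp.open("POST", "/ajax/", true) above
url = "https://example.com/ajax/"
response = requests.post(url)
if response.status_code == 200:
    # The same text the browser would assign to myDiv.innerHTML
    print(response.text)

This is why Ajax-driven pages remain scrapable: whatever the page's JavaScript can fetch, a script can fetch too.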
2. How to Analyze Ajax Requests
2.1 A worked example
- Case study: Weibo (weibo.cn)
- In the DevTools Network panel, Ajax requests appear with type xhr
- If the Request Headers contain X-Requested-With: XMLHttpRequest, the request is an Ajax request
- The parsed response content can be inspected in the Preview tab
- The raw returned data can be inspected in the Response tab
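Once an Ajax request has been identified, it can be replayed outside the browser by copying its URL (and any required headers) from the Network panel. A minimal sketch, with a placeholder URL standing in for the endpoint you actually copied:

import requests

# Placeholder; paste the real Ajax URL from the DevTools Network panel
url = "https://example.com/api/data"
headers = {
    # The header libraries such as jQuery attach, and the marker mentioned above
    "X-Requested-With": "XMLHttpRequest",
}
response = requests.get(url, headers=headers)
# Ajax endpoints usually answer with JSON
print(response.json())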
2.2 Filtering requests
In the DevTools Network panel, click the XHR filter (Fetch/XHR in current Chrome) so that only Ajax requests are listed and ordinary page resources are hidden.
3. Ajax Analysis and Scraping in Practice
3.1 Scraping target
The target is the demo site https://spa1.scrape.center/, a movie-listing site rendered entirely by JavaScript; the goal is to pull the list data and each movie's detail data from its Ajax API.
3.2 Initial exploration
import requests
url = "https://spa1.scrape.center/"
html = requests.get(url).text
print(html)
- The HTML that comes back contains very little content
- The entire page is rendered by JavaScript: the browser executes the JavaScript files referenced in the HTML, and that JavaScript loads data and renders the page, producing what you see in the browser
- The data is generally loaded via Ajax: the JavaScript calls Ajax data endpoints in the background (see the check below)
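This is easy to verify from Python: the raw HTML should be short and should not contain any of the movie data the rendered page shows. A small sketch, assuming (as on the demo site) that 霸王别姬 is the first movie listed:

import requests

html = requests.get("https://spa1.scrape.center/").text
print(len(html))           # only the page skeleton, so a small number
print("霸王别姬" in html)   # False: the movie title is absent from the raw HTML

The title only appears after JavaScript calls the Ajax API, which the following sections request directly.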
3.3 Scraping list pages
Analysis
Each list page corresponds to one Ajax call of the form https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}, where limit is the page size and offset = limit * (page - 1).
Implementation
Basic configuration
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s')

INDEX_URL = "https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}"
Scraping page content (fetching a page's JSON)
import requests

def scrape_api(url):
    logging.info(f"scraping {url}...")
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Ajax endpoints return JSON, so parse it directly
            return response.json()
        logging.error(
            f"Status code: {response.status_code} while scraping {url}")
    except requests.RequestException:
        logging.error(f"Error while scraping {url}", exc_info=True)
    # On any failure the function falls through and returns None
Scraping list pages (fetching a given list page)
LIMIT = 10

def scrape_index(page):
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)
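As a quick sanity check of the offset arithmetic, page 2 with LIMIT = 10 maps to offset 10:

print(INDEX_URL.format(limit=LIMIT, offset=LIMIT * (2 - 1)))
# https://spa1.scrape.center/api/movie/?limit=10&offset=10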
Putting it together
import logging
import requests

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s')

INDEX_URL = "https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}"
LIMIT = 10

def scrape_api(url):
    logging.info(f"scraping {url}...")
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        logging.error(
            f"Status code: {response.status_code} while scraping {url}")
    except requests.RequestException:
        logging.error(f"Error while scraping {url}", exc_info=True)

def scrape_index(page):
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)
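A short usage check; field names here are assumptions based on the demo API, which answers with a JSON object holding a count field and a results list:

data = scrape_index(1)
if data:
    print(sorted(data.keys()))   # expected: ['count', 'results']
    print(len(data["results"]))  # expected: 10, matching LIMIT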
3.4 Scraping detail pages
Analysis
Each movie's detail data comes from its own Ajax endpoint, https://spa1.scrape.center/api/movie/{id}, where id is the movie's id taken from the list data.
Implementation
Scraping a detail page
DETAIL_URL = "https://spa1.scrape.center/api/movie/{id}"

def scrape_detail(id):
    url = DETAIL_URL.format(id=id)
    return scrape_api(url)
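For example, the movie with id 1 resolves to:

print(DETAIL_URL.format(id=1))
# https://spa1.scrape.center/api/movie/1

so scrape_detail(1) fetches that movie's full JSON record.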
Chaining the calls
TOTAL_PAGE = 10

def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            continue  # scrape_api returned None, skip this page
        for item in index_data.get("results"):
            id = item.get("id")
            detail_data = scrape_detail(id)
            logging.info(f"detail data {detail_data}")

if __name__ == "__main__":
    main()
Putting it together
import logging
import requests

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s')

INDEX_URL = "https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}"
DETAIL_URL = "https://spa1.scrape.center/api/movie/{id}"
LIMIT = 10
TOTAL_PAGE = 10

def scrape_api(url):
    logging.info(f"scraping {url}...")
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        logging.error(
            f"Status code: {response.status_code} while scraping {url}")
    except requests.RequestException:
        logging.error(f"Error while scraping {url}", exc_info=True)

def scrape_index(page):
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)

def scrape_detail(id):
    url = DETAIL_URL.format(id=id)
    return scrape_api(url)

def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            continue  # skip pages that failed to load
        for item in index_data.get("results"):
            id = item.get("id")
            detail_data = scrape_detail(id)
            logging.info(f"detail data {detail_data}")

if __name__ == "__main__":
    main()
3.5 Saving the data (MongoDB) (to be added later)
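Since this section is marked as pending, here is a minimal sketch of the saving step, assuming a local MongoDB instance and the pymongo driver (the connection string, database name, and collection name are all assumptions):

import pymongo

# Assumed local instance; adjust the connection string to your deployment
client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client["movies"]["movies"]

def save_data(data):
    # Upsert keyed on the movie name so reruns do not insert duplicates
    collection.update_one(
        {"name": data.get("name")},
        {"$set": data},
        upsert=True)

main() would then call save_data(detail_data) right after each detail page is scraped.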
Source: https://blog.csdn.net/BlackOrnate/article/details/134735396