基于Charles数据采集自动化的一次实践_sinat
前提
数据采集,一般直接调用数据传输接口。当数据传输接口的调用过于复杂时,比如:请求参数多层嵌套或使用自定义加密等,此时不妨考虑基于第三方代理软件直接拦截传输的数据。
拦截传输数据的完整流程为:设置第三方代理、程序模拟人操作、输出第三方代理软件拦截的数据、解析并存储数据。本文使用的第三方代理软件为Charles,故以上流程均基于Charles进行演示。
设置第三方代理
进入Charles主页面后,如下依次点击,查看代理的IP与监控的端口。
如果使用Selenium等模拟人操作网页,可忽略此步;如果使用Appium等模拟人操作手机APP页面,不可忽略此步。
模拟人操作
可使用Selenium等模拟人操作网页、使用Appium等模拟人操作手机APP页面等,此处不作赘述。
输出第三方代理软件拦截的数据
默认情况下,Charles会展示所有拦截的请求。
为方便查看数据接口且提高数据处理的性能,Charles提供请求过滤功能;可如下依次点击。
Charles提供分钟级的请求自动导出功能,可指定导出路径和导出文件的格式。
解析并存储数据
以“JSON Session File”为例,数据结构如下,可根据需要进行解析。
{
"status": "COMPLETE",
"method": "POST",
"protocolVersion": "HTTP/1.1",
"scheme": "http",
"host": "210.52.217.135",
"port": 80,
"actualPort": 80,
"path": "/cloudquery.php",
"query": null,
"tunnel": false,
"keptAlive": false,
"webSocket": false,
"remoteAddress": "210.52.217.135/210.52.217.135",
"clientAddress": "/127.0.0.1",
"clientPort": 56166,
"times": {
"start": "2021-10-08T10:52:33.578+08:00",
"requestBegin": "2021-10-08T10:52:33.586+08:00",
"requestComplete": "2021-10-08T10:52:33.586+08:00",
"responseBegin": "2021-10-08T10:52:33.653+08:00",
"end": "2021-10-08T10:52:33.654+08:00"
},
"durations": {
"total": 75,
"dns": 0,
"connect": 7,
"ssl": null,
"request": 0,
"response": 1,
"latency": 67
},
"speeds": {
"overall": 31666,
"request": 0,
"response": 703000
},
"totalSize": 2375,
"request": {
"sizes": {
"headers": 334,
"body": 1338
},
"mimeType": "multipart/form-data",
"charset": null,
"contentEncoding": null,
"header": {
"firstLine": "POST /cloudquery.php HTTP/1.1",
"headers": [{
"name": "User-Agent",
"value": "Post_Multipart"
},
{
"name": "Host",
"value": "210.52.217.135"
},
{
"name": "Accept",
"value": "*/*"
},
{
"name": "Pragma",
"value": "no-cache"
},
{
"name": "X-360-Cloud-Security-Desc",
"value": "Scan Suspicious File"
},
{
"name": "x-360-ver",
"value": "4"
},
{
"name": "Content-Length",
"value": "1338"
},
{
"name": "Content-Type",
"value": "multipart/form-data; boundary=----------------------------a692a93e79cd"
},
{
"name": "Cache-Control",
"value": "no-cache"
},
{
"name": "Connection",
"value": "keep-alive"
}]
},
"body": {
"encoding": "base64",
"encoded": "LS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tYTY5MmE5M2U3OWNkDQpDb250ZW50LURpc3Bvc2l0aW9uOiBmb3JtLWRhdGE7IG5hbWU9Im0iDQoNCgoEBKhewAABAADyV+KtCjNgKSDLdD2n/exxS46lIe02XmWPBtkQboV5X4TJU6aG+w+FFIVzeF9becFANl9uXrdQYKkH9ET8uKHp5ra+045buG+IjRD4DdG1d0VeMeCUsQN3kNhteJuepNe59eftz7o+2beqmSPfsfd4Zw3eYqw+KH4FB8wcd4F7GmmMp2phzgaZE18GfH83byQWMhWSKMQziED9DDiie/m4clnY9gnmiFcIZC5Wm//n+Ck3o5xQjHqB1Jd6c2rPHR1gGPVk3JNBCXiuWGB5e/OpvwWpBlze3ICJQ3X0yDR18xPrKXdqOqP/CKGu3EkSj5OzY0eHdoSHLddiglYr1Htfn8WXVKfQQIS3v3K4s+ZXB5ffuoq5X67msU55yyHZgqRejf0XpcJ3iiW1VldoDUDCxog1NS95o5zhCk+IUstm59AuK9WWhmax0QaQ3dQMPSIz2PiTSYbYrIpOh+3sOBd77MMZmxD2OAziybPoWT79XmxsFrwL/dI2bBUvZ0Ve19AHlTMdUYYyFPkbYzvMoAAHbZOKRn4Tvp8C1G5e5Qosw4ccY+Mv+214cOdAdA3x/nDvOQxs9DQW1wCfKY38vmYawbLT2myNk7aKYNXUu6fLbZEapDOMJxTu3d9rcY+fE8rogbYIrDR45nky+7Lpvv8mJ//GLpCQjeW4oSF7Boywe4rwi/hfroLinaorR6EpN1E63+Y1eYgjqkVh2fe9AB4EEnhhWimx0xE6/c15SpnNNXr+3LyZqKv38E8jo9qVw7sqEAzG1lKzZRqcFeovJntqDgMoy1DTgRfomYpV4bOWVIh8636e1YwJvc5iXaZ7n/a2yhgMsVqxXyblNHWOH82bR4aDyS31F3pdk6MD2WXtTkiSjfNyaZ2stuWGlgnz2t+YVZJ4xxS3/ePjrCMY3lg29+3az/WdvS9E/5qo6poIWmgSnkE1UjDMO5UBvq0RDeVbW9XVzpKy7tVpScjTrGBPKehMwQf0+aS9Hy/NkSblmciUuIUp2aGcINRIjrrILtiYUd/POE3PBFzIxzKlc2MySKBNcMlgZmjGpTeoG/NiK4R9Z4yxkPyPvC9bCelTftEkY2KFzJtcZfXI/IujqA6X0CNeSiQlQVl1rA2EpKMLqlviQZ7m95GgqCI0YJWDaTxd8zsYbsCbAxK38psabThfbCWmcHMUjiHQbJJtw7F4+/KaRnzm+L18lmHYeY55dnZroPsOR24rRZtXTbBjlLRg2QWnkzVtlhGao/wXJVAz03LFmMTX+YiHioCou4ShnW/DaMfayhZctY93Gd5PWk3lpCk4UEPo7pJtbXXsVJu0jxzPxlk1uCfbTPz9q61dEX1QKSjdkUYtLzHM5sCx2vTEQ2FEVAVPSgOwXMJyWD1V4BLcMr3zKVnR35wg/TB7qWgNjpavEn6L5QYr/RNPHvSL7QK+Ndp29daGABr9fbEJQjTlR6NmmY8BQaET5mfB2F8+F39OZfRRACXKr7rwoeXkv7IWdMs2hN/JxJ+QpCUHhh0Zb3RmrXAVyBA4zRBosaiB6mGMQz9DLapGWmlQNXWl5kpSHZxNKWz3uRuRU/uyavpx0aHUYHFcDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS1hNjkyYTkzZTc5Y2QtLQ0K"
}
},
"response": {
"status": 200,
"sizes": {
"headers": 236,
"body": 467
},
"mimeType": "application/octet-stream",
"charset": null,
"contentEncoding": null,
"header": {
"firstLine": "HTTP/1.1 200 OK",
"headers": [{
"name": "Server",
"value": "nginx"
},
{
"name": "Date",
"value": "Fri, 08 Oct 2021 02:52:15 GMT"
},
{
"name": "Content-Type",
"value": "application/octet-stream"
},
{
"name": "Transfer-Encoding",
"value": "chunked"
},
{
"name": "Cache-Control",
"value": "no-cache"
},
{
"name": "pragma",
"value": "no-cache"
},
{
"name": "Expires",
"value": "0"
},
{
"name": "Proxy-Connection",
"value": "close"
}]
},
"body": {
"encoding": "base64",
"encoded": "CgQBv17AAAEAABECUEFJrq6Td0kAiSrClNUoidc4a+3QyOM0gJi/48D8GBcYMa4/lcmPwdvJcPivJ/RwB8yroOFPbPDvng/24w9uNMgyzdFPhkPbmC02tznpo4YqBpsHnzz/UqcLoQahAPFVpw73D6sGp1OlVasA8wb2BKsOpQ/3BasDogW5UqEOplOrUKYGpQSnBKMHp1WjAqRQ9gSkB/AH9geqAqdV9wWiVacApT+jBrwFoD+iP6I/oj+gA5sDmz+bZtp45Gz8ZOhV/3u5VaB/pHvGd6N71mejePhV6lDaTuhv/wKkA95P0APeX/MD8ADnA94d+mbRD+hUoGyiVaF8+Gb1C68/mwabBps8qlOkA/cGoQWlD6tT8A/0UqYA8FWnVaMOplOlUqNUqgO5U6NV8QOhB6FU9wf0AfZQplWmBaQPpVCqUKUGowWiA6cOp1D3AvcBoz+jBrwGmwabBpsGmweiBKs/pT+bP8Jw1GHGYqdhxHDAT/Zu3AbDYMh59mGiAN9y5UDHYMh4wlzqWfNu0EzCXadzx3DYZsRzq2LdXNQO9HOnc8MG1GPdWNxBygTURfBy5UDzcf5B8UymC5s/oj+iP5g7mA=="
}
}
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
- 92
- 93
- 94
- 95
- 96
- 97
- 98
- 99
- 100
- 101
- 102
- 103
- 104
- 105
- 106
- 107
- 108
- 109
- 110
- 111
- 112
- 113
- 114
- 115
- 116
- 117
- 118
- 119
- 120
- 121
- 122
- 123
- 124
- 125
- 126
- 127
- 128
- 129
- 130
- 131
- 132
- 133
- 134
- 135
- 136
- 137
- 138
- 139
- 140
- 141
- 142
- 143
- 144
补充
拦截的请求里除了数据外,还有URL、请求头、请求体等。当某些网站的Cookie、Session等爬虫资源更新频率较低时,该方案也可用于更新相应的爬虫资源。
推荐阅读