写了一个简单的基于chreeio的数据爬取代码,node http的模块获取html 然后用chreeio爬取。
想使用最新的async await 来异步操作.代码如下:
let request = require('request');
let cheerio = require('cheerio');
let fs = require('fs');
let http = require('http');
let https = require('https');
let newCollection = [];
let newCollection_item = {
title: '',
href: ''
};
const hrefPrefix = 'xxxxxxxxx'
class Crawler {
static test() {
// await getList();
//想在这里直接获取newCollection
console.log(this.getList());
}
//这里是一个异步获取list
static async getList() {
return new Promise((resolve, reject) => {
http.get("http://xxxx/xxxxxx.com", (res) => {
res.setEncoding('utf-8');
let html = "";
res.on('data', (chunk) => {
html += chunk;
});
res.on('end', () => {
let $ = cheerio.load(html);
//爬取数据
$('#ajaxtable tbody tr').each(function(index, item) {
if (index > 4) {
let title = $(this).children().eq(1).find('h3').find('a').text();
let href = `${hrefPrefix}` + $(this).children().eq(1).find('h3').find('a').attr('href');
newCollection_item = { title, href };
newCollection.push(newCollection_item);
}
}, this);
resolve(res);
// console.log("=============================================list=================================
", newCollection)
})
})
})
return newCollection;
}
}
Crawler.test();
不知道哪里写错了,目的就是想在test()里面能直接拿到getList返回的newCollection;