Crawling Web Pages with WebMagic and Selenium

Published on 2019-06-01


I was recently crawling references on WOS (Web of Science): only part of each page is present in the raw HTML, and the rest is loaded by JavaScript. Since I was rushing to hand in an assignment, I simply tried fetching the pages through Selenium, and I'm writing the approach down here.

Maven Configuration

Besides webmagic itself, the following dependencies are required:

    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-selenium</artifactId>
      <version>0.7.3</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>3.0.1</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-chrome-driver</artifactId>
      <version>3.0.1</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-server</artifactId>
      <version>2.18.0</version>
    </dependency>

Installing the Driver & Configuration File

sudo apt-get install chromium-chromedriver

The configuration file is shown below; set chrome_exec_path to the actual path of your Chrome binary:

driver=chrome
chrome_exec_path=/usr/bin/google-chrome-stable
chrome_driver_loglevel=DEBUG

Usage

First, set the path to the configuration file (the property name selenuim_config is misspelled, but that is the spelling webmagic-selenium actually reads):

System.setProperty("selenuim_config", "/home/likole/tmp/selenuim/config.ini");

Then add .setDownloader(new SeleniumDownloader()) to the Spider.create(xxx) chain, so pages are fetched through Chrome and the JS-rendered content becomes visible to the extractors.
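
Putting the two steps together, here is a minimal sketch. The config path and start URL are placeholders, and WOSPageProcessor is the processor from the example in the next section:

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;

public class SeleniumSpiderDemo {
    public static void main(String[] args) {
        // set the config path before the spider runs; note the property
        // name really is spelled "selenuim_config"
        System.setProperty("selenuim_config", "/path/to/config.ini");

        Spider.create(new WOSPageProcessor())             // any PageProcessor works here
                .addUrl("http://example.com")             // placeholder start URL
                .setDownloader(new SeleniumDownloader())  // fetch pages through Chrome so JS executes
                .run();
    }
}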

Example

This example crawls the cited references of a WOS record; for reference lists spread across multiple pages, only one page is crawled at the moment.

package preprocess.spider.reference;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;

public class WOSPageProcessor implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).addHeader("cookie", "**********"); // cookie masked; supply your own WOS session cookie

    @Override
    public void process(Page page) {
        // WOS accession number, e.g. "WOS:..." (19 chars including the "WOS:" prefix)
        String wos = page.getHtml().xpath("//input[@name='00N70000002n880']/@value").get();
        if (wos != null && wos.length() == 19) {
            page.putField("fromWOS", wos.substring(4)); // strip the "WOS:" prefix
        }

        //title
        page.putField("fromTitle", page.getHtml().xpath("//div[@class='title']/allText()").get());

        //reference documents
        page.putField("toTitles", page.getHtml().xpath("//a[@class='smallV110 snowplow-full-record']/allText()").all());
        page.putField("toWOSs", page.getHtml().xpath("//a[@class='smallV110 snowplow-full-record']/@href").all());

        // skip records where no WOS id could be extracted
        if (page.getResultItems().get("fromWOS") == null) {
            page.setSkip(true);
        }

        // follow the pagination link, if there is one
        String next = page.getHtml().xpath("//form[@id='paginationForm']//a[2]").links().get();
        if (next != null) {
            page.addTargetRequest(next);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // note: this must be set to the path of the selenium config file
        System.setProperty("selenuim_config", "/home/likole/tmp/selenuim/config.ini");
        Spider.create(new WOSPageProcessor())
                .addUrl("http://apps.webofknowledge.com/full_record.do?product=WOS&search_mode=GeneralSearch&qid=33&SID=6BYRHPnTGnonsmCbQU6&page=1&doc=500")
                .setDownloader(new SeleniumDownloader())
                .addPipeline(new DbPipeline())
                .addPipeline(new ConsolePipeline())
                .thread(5)
                .run();
    }
}
The DbPipeline referenced above stores each from/to reference pair in the database:

package preprocess.spider.reference;

import com.likole.aihw.bean.ArticleReference;
import preprocess.utils.DbUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.List;

/**
 * @author likole
 */
public class DbPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        String fromWOS = resultItems.get("fromWOS");
        String fromTitle = resultItems.get("fromTitle");
        List<String> toWOSs = resultItems.get("toWOSs");
        List<String> toTitles = resultItems.get("toTitles");
        for (int i = 0; i < toWOSs.size(); i++) {
            ArticleReference articleReference = new ArticleReference();
            articleReference.setFromWOS(fromWOS);
            articleReference.setFromTitle(fromTitle);
            // the href contains "WOS:<id>"; keep only the id after the prefix
            articleReference.setToWOS(toWOSs.get(i).substring(toWOSs.get(i).indexOf("WOS:") + 4));
            articleReference.setToTitle(toTitles.get(i));
            DbUtils.getDao().insertOrUpdate(articleReference);
        }
    }
}
The ArticleReference entity (Nutz.Dao annotations; the primary key is the from/to pair):

package com.likole.aihw.bean;

import org.nutz.dao.entity.annotation.*;

@Table
@PK({"fromWOS","toWOS"})
public class ArticleReference {
    @Column
    private String fromWOS;

    @Column
    @ColDefine(type = ColType.TEXT)
    private String fromTitle;

    @Column
    private String toWOS;

    @Column
    @ColDefine(type = ColType.TEXT)
    private String toTitle;

    public String getFromWOS() {
        return fromWOS;
    }

    public void setFromWOS(String fromWOS) {
        this.fromWOS = fromWOS;
    }

    public String getFromTitle() {
        return fromTitle;
    }

    public void setFromTitle(String fromTitle) {
        this.fromTitle = fromTitle;
    }

    public String getToWOS() {
        return toWOS;
    }

    public void setToWOS(String toWOS) {
        this.toWOS = toWOS;
    }

    public String getToTitle() {
        return toTitle;
    }

    public void setToTitle(String toTitle) {
        this.toTitle = toTitle;
    }
}
And DbUtils, which lazily creates a shared Dao:

package preprocess.utils;

import org.nutz.dao.Dao;
import org.nutz.dao.impl.NutDao;
import org.nutz.dao.impl.SimpleDataSource;

/**
 * @author likole
 */
public class DbUtils {
    private static Dao dao;


    // synchronized: the spider runs several threads that may race to initialize the Dao
    public static synchronized Dao getDao() {
        if (dao == null) {
            SimpleDataSource dataSource = new SimpleDataSource();
            dataSource.setJdbcUrl("jdbc:mysql://127.0.0.1/aihw");
            dataSource.setUsername("aihw");
            dataSource.setPassword("2CNctX4ht6Rhe6yp");
            dao = new NutDao(dataSource);
        }
        return dao;
    }


}