ITEEDU

用HtmlUnit抓取雪球数据

HtmlUnit 是一款开源的 Java 页面分析工具,读取页面后,可以有效地使用 HtmlUnit 分析页面上的内容。该项目可以模拟浏览器运行,被誉为 Java 浏览器的开源实现。这个没有界面的浏览器,运行速度也非常快。

第一步:创建一个客户端

// Obtain the WebClient from the HttpUtil helper (not shown in this article;
// presumably part of the downloadable project code — how it configures the client is not visible here).
WebClient client = HttpUtil.getClient();

第二步:登录

    /**
     * Logs in to xueqiu.com by submitting the login form on the home page.
     *
     * <p>Loads {@code http://xueqiu.com}, fills in the {@code username} and
     * {@code password} inputs of the form whose id is {@code form-login-index},
     * and clicks the first {@code <button>} inside that form.
     *
     * <p>NOTE(review): the account name and password are hard-coded in this
     * method; consider reading them from configuration instead.
     *
     * @param client the HtmlUnit client used to load the page and submit the form
     * @return the page produced by clicking the submit button
     * @throws IOException if loading the page or submitting the form fails
     * @throws MalformedURLException kept in the signature for existing callers
     * @throws IllegalStateException if the loaded page has no login form, or the
     *         form contains no button to click
     */
    public static HtmlPage loginXueqiu(WebClient client)
        throws IOException, MalformedURLException {
        HtmlPage page = (HtmlPage) client.getPage("http://xueqiu.com");

        // Fail with a descriptive message instead of a bare NullPointerException
        // when the page layout differs from what this method expects.
        HtmlForm loginForm = (HtmlForm) page.getElementById("form-login-index");
        if (loginForm == null) {
            throw new IllegalStateException(
                    "Login form 'form-login-index' not found on http://xueqiu.com");
        }

        HtmlTextInput account = (HtmlTextInput) loginForm.getInputByName("username");
        HtmlPasswordInput password = (HtmlPasswordInput) loginForm
                .getInputByName("password");

        // The form's first <button> is treated as the submit button.
        DomNodeList<?> buttons = loginForm.getElementsByTagName("button");
        if (buttons.isEmpty()) {
            throw new IllegalStateException(
                    "Login form 'form-login-index' contains no <button> element");
        }
        HtmlButton submit = (HtmlButton) buttons.get(0);

        account.setValueAttribute("xueqiuclient@126.com");
        password.setValueAttribute("xueqiu");
        return (HtmlPage) submit.click();
    }

第三步:抓取数据。以获取股票列表为例:

    /**
     * Fetches a stock list from a JSON endpoint.
     *
     * <p>The response body is parsed as a JSON object and its {@code "data"}
     * array is read; each element is handed to the {@code TStock} constructor
     * as a {@code JSONArray}. Elements that cannot be converted are logged and
     * skipped so that one bad row does not discard the whole list.
     *
     * <p>NOTE(review): the return type is left as a raw {@code List} to keep
     * the signature unchanged; the elements are {@code TStock} instances, so
     * {@code List<TStock>} would be the more precise declaration.
     *
     * @param client the HtmlUnit client used to request {@code url}
     * @param url    the endpoint returning the JSON stock list
     * @return the stocks that could be parsed; an empty list if the request
     *         or the parsing of the response as a whole fails
     */
    private static List getStockList(WebClient client, String url) {
        try {
            Page page = client.getPage(url);
            String body = page.getWebResponse().getContentAsString();
            // Parse the payload exactly once.
            JSONObject json = JSONObject.fromObject(body);
            JSONArray rows = json.getJSONArray("data");

            List<TStock> stocks = new ArrayList<TStock>();
            for (Object row : rows) {
                try {
                    // The cast sits inside the try so a non-array row is skipped
                    // like any other malformed row.
                    stocks.add(new TStock((JSONArray) row));
                } catch (Exception e) {
                    // Best effort: keep going, but leave a trace of what was dropped.
                    log.warn("跳过无法解析的股票记录: " + row, e);
                }
            }
            return stocks;
        } catch (Exception e) {
            // Include the URL: this method serves every list endpoint, not only Shanghai.
            log.error("获取股票列表出错: " + url, e);
            return new ArrayList<TStock>();
        }
    }

其中 url 是通过浏览器的开发者工具调试(查看页面发出的网络请求)得到的接口地址,这些接口返回的是 JSON 数据。

    /** Stock list for the Shanghai exchange (query uses {@code stockType=sha}). */
    public static final String API_SHA =
            "http://xueqiu.com/stock/quote_order.json"
            + "?page=1&size=2000&order=asc&exchange=CN&stockType=sha"
            + "&orderBy=symbol&column=symbol%2Cname";

    /** Stock list for the Shenzhen exchange (query uses {@code stockType=sza}). */
    public static final String API_SZA =
            "http://xueqiu.com/stock/quote_order.json"
            + "?page=1&size=2000&order=asc&orderBy=symbol&exchange=CN"
            + "&stockType=sza&column=symbol%2Cname";

    /**
     * Main financial indicators. The URL ends with {@code symbol=};
     * presumably the caller appends the stock symbol.
     */
    public static final String API_ZYCWZB =
            "http://xueqiu.com/stock/f10/finmainindex.json"
            + "?page=1&size=40&symbol=";

    /**
     * Balance sheet. The URL ends with {@code symbol=};
     * presumably the caller appends the stock symbol.
     */
    public static final String API_BALSHEET =
            "http://xueqiu.com/stock/f10/balsheet.json"
            + "?page=1&size=40&symbol=";

下载代码:

http://pan.baidu.com/s/1sjHnrzv

提取码:wr9j