Monday, November 19, 2012

Extract text from HTML tags

How to extract text from HTML tags using Java and the free jsoup library.
The following code also uses a proxy configuration. You can skip the proxy part if it's not required.

Import all required classes (from java.net, java.io and org.jsoup) at the top of the file.


public class urltest { public static void main (String[] args) throws Exception{ try { String urltext = "http://www.surface-india.com/"; URL url = new URL(urltext); //URL to extract Text Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("host", 80)); HttpURLConnection uc = (HttpURLConnection)url.openConnection(proxy); sun.misc.BASE64Encoder encoder = new sun.misc.BASE64Encoder(); String encodedUserPwd = encoder.encode("domain\\username:password".getBytes()); uc.setRequestProperty("Proxy-Authorization", encodedUserPwd); //DO NOT TOUCH START uc.connect(); String line = null; StringBuffer tmp = new StringBuffer(); BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream())); while ((line = in.readLine()) != null) { tmp.append(line); } Document doc = Jsoup.parse(String.valueOf(tmp)); //DO NOT TOUCH END Elements div = doc.select("p"); //tag name String text = div.text(); System.out.println(text); } catch (Exception e) { e.printStackTrace(); } } }


Download JSOUP : http://jsoup.org/packages/jsoup-1.7.1.jar
Visit JSOUP homepage : http://jsoup.org/
Syntax to use : http://jsoup.org/cookbook/extracting-data/selector-syntax