How to extract content from a PDF file in Java
Here I am extracting the last 200 characters of a PDF file.
import java.io.File;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Small command-line example that loads a PDF with PDFBox, prints its page
 * count, and prints the trailing characters of its extracted text.
 */
public class PDFReader {

    /** PDF that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "C:/Users/mkum63/Desktop/vit-strategic-growth-inst-sp.pdf";

    /** Maximum number of trailing characters of the extracted text to print. */
    private static final int TAIL_LENGTH = 200;

    /**
     * Loads the PDF, prints the number of pages, then prints at most the last
     * {@link #TAIL_LENGTH} characters of the text extracted from it.
     *
     * <p>Any exception raised while loading or reading the document is caught
     * and its stack trace is printed; it is not rethrown.
     *
     * @param args optional; if present, {@code args[0]} is the path of the PDF
     *             to read, otherwise {@link #DEFAULT_PDF_PATH} is used
     */
    public static void main(String[] args) {
        String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        try {
            PDDocument pddDocument = PDDocument.load(new File(pdfPath));
            try {
                System.out.println(pddDocument.getNumberOfPages());
                PDFTextStripper textStripper = new PDFTextStripper();
                String text = textStripper.getText(pddDocument);
                // Clamp the start index so documents with fewer than
                // TAIL_LENGTH characters print in full instead of throwing
                // StringIndexOutOfBoundsException.
                int start = Math.max(0, text.length() - TAIL_LENGTH);
                System.out.println(text.subSequence(start, text.length()));
            } finally {
                // Close the document even when text extraction fails.
                pddDocument.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
Jar files required :
commons-logging-api-1.1.1
fontbox-1.2.1
pdfbox-1.3.1
Here I am extracting the last 200 characters of a PDF file.
import java.io.File;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Small command-line example that loads a PDF with PDFBox, prints its page
 * count, and prints the trailing characters of its extracted text.
 */
public class PDFReader {

    /** PDF that is read when no path is supplied on the command line. */
    private static final String DEFAULT_PDF_PATH =
            "C:/Users/mkum63/Desktop/vit-strategic-growth-inst-sp.pdf";

    /** Maximum number of trailing characters of the extracted text to print. */
    private static final int TAIL_LENGTH = 200;

    /**
     * Loads the PDF, prints the number of pages, then prints at most the last
     * {@link #TAIL_LENGTH} characters of the text extracted from it.
     *
     * <p>Any exception raised while loading or reading the document is caught
     * and its stack trace is printed; it is not rethrown.
     *
     * @param args optional; if present, {@code args[0]} is the path of the PDF
     *             to read, otherwise {@link #DEFAULT_PDF_PATH} is used
     */
    public static void main(String[] args) {
        String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        try {
            PDDocument pddDocument = PDDocument.load(new File(pdfPath));
            try {
                System.out.println(pddDocument.getNumberOfPages());
                PDFTextStripper textStripper = new PDFTextStripper();
                String text = textStripper.getText(pddDocument);
                // Clamp the start index so documents with fewer than
                // TAIL_LENGTH characters print in full instead of throwing
                // StringIndexOutOfBoundsException.
                int start = Math.max(0, text.length() - TAIL_LENGTH);
                System.out.println(text.subSequence(start, text.length()));
            } finally {
                // Close the document even when text extraction fails.
                pddDocument.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
Jar files required :
commons-logging-api-1.1.1
fontbox-1.2.1
pdfbox-1.3.1