java读取html文件,并获取body中所有的标签及内容的案例-APISpace

java读取html文件,并获取body中所有的标签及内容的案例

这里的获取的是html文件中body中的所有标签以及内容

package com.lmt.service.file;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.InputStreamReader;

import java.io.Reader;

import org.springframework.stereotype.Component;

import com.lmt.config.UrlConstants;

@Component

public class ParseFile {

/**

* 解析html文件

* @param file

* @return

public String readHtml(File file){

String body = "";

try {

FileInputStream iStream = new FileInputStream(file);

Reader reader = new InputStreamReader(iStream);

BufferedReader htmlReader = new BufferedReader(reader);

String line;

boolean found = false;

while (!found && (line = htmlReader.readLine()) != null) {

if (line.toLowerCase().indexOf("

found = true;

}

found = false;

while (!found && (line = htmlReader.readLine()) != null) {

if (line.toLowerCase().indexOf("

found = true;

} else {

// 如果存在图片，则将相对路径转换为绝对路径

String lowerCaseLine = line.toLowerCase();

if (lowerCaseLine.contains("src")) {

//这里是定义图片的访问路径

String directory = "D:/test";

// 如果路径名不以反斜杠结尾，则手动添加反斜杠

/*if (!directory.endsWith("\\")) {

directory = directory + "\\";

}*/

// line = line.substring(0, lowerCaseLine.indexOf("src") + 5) + directory + line.substring(lowerCaseLine.indexOf("src") + 5);

/*String filename = extractFilename(line);

line = line.substring(0, lowerCaseLine.indexOf("src") + 5) + directory + filename + line.substring(line.indexOf(filename) + filename.length());

// 如果该行存在多个元素，则分行进行替代

String[] splitLines = line.split("

// 因为java中引用的问题不能使用for eachZDaPdQI

for (int i = 0; i < splitLines.length; i++) {

if (splitLines[i].toLowerCase().startsWith("src")) {

splitLines[i] = splitLines[i].substring(0, splitLines[i].toLowerCase().indexOf("src") + 5)

+ directory

+ splitLines[i].substring(splitLines[i].toLowerCase().indexOf("src") + 5);

}

// 最后进行拼接

line = "";

for (int i = 0; i < splitLines.length - 1; i++) { // 循环次数要-1，因为最后一个字符串后不需要添加

line = line + splitLines[i] + "

}

line = line + splitLines[splitLines.length - 1];

}

body = body + line + "\n";

}

htmlReader.close();

// System.out.println(body);

} catch ZDaPdQI(Exception e) {

e.printStackTrace();

}

return body;

}

/**

* @param htmlLine 一行html片段，包含元素

* @return 文件名

public static String extractFilename(String htmlLine) {

int srcIndex = htmlLine.toLowerCase().indexOf("src=");

if (srcIndex == -1) { // 图片不存在，返回空字符串

return "";

} else {

String htmlSrc = htmlLine.substring(srcIndex + 4);

char splitChar = '\"'; // 默认为双引号，但也有可能为单引号

if (htmlSrc.charAt(0) == '\'') {

splitChar = '\'';

}

String[] firstSplit = htmlSrc.split(String.valueOf(splitChar));

String path = firstSplit[1]; // 第0位为空字符串

String[] secondSplit = path.split("[/\\\\]"); // 匹配正斜杠或反斜杠

return secondSplit[secondSplit.length - 1];

}

补充知识：StandardEngine[Catalina].StandardHost[localhost].StandardContext[]

jar包没有正确导入

1、在 build path 中添加

2、如果这里不添加在编译的时你的jar包将不会被导入

3、如果依然没有成功请删除user jar包重新导入

Linux中怎么用cat命令创建文件并写入数据

277 2023-03-25

java读取html文件,并获取body中所有的标签及内容的案例

linux怎么查看本机内存大小

Linux中怎么用cat命令创建文件并写入数据

mysql连接测试不成功的原因有哪些

推荐文章

api接口有哪几种分类及功能

什么是API接口?API接口简单介绍

短信API接口概述，短信API接口的优势

7款快递物流的物流查询API工具，物流快递查询API接口怎么对接？

企业四要素: 了解企业经营成功的关键

什么是语音验证码?,语音验证码平台有哪些

全国工商查询系统怎么查企业名录

哪些平台提供实名认证的接口？

PHP如何调用API接口?

如何使用百度天气预报API接口?

最近发表

热评文章

数据接口api（数据接口API开发平台）

数据开放接口api（数据服务api开发）

Python爬虫教程：爬取酷狗音乐（python爬取

hbuilder怎么更改字体大小和颜色

直播平台api接口 - 构建卓越的直播平台

实时股票数据api接口（股票实时行情api接口）