This is a follow-up to: How to extract data from HTML page source of (a tab within) a webpage?
We're currently extracting the tabular data from the Financials section of a company's page, for example the table at: https://finance.yahoo.com/quote/AAPL/financials?p=AAPL&guccounter=2
However, the response I get from my code is an empty string. When I look at the Root.App.main section that we were previously extracting, it looks like a bunch of encrypted strings. I am not sure if I am making a mistake in reading this. What's the best way to extract this in Java for Android?
Is there a better way to extract a specific value? For example, I want to extract 394,328,000, which is the Total Revenue on 9/30/2022. Preferably, I'd like to have the entire table data as a Map.
Here's my current code, which may shed more light on how it's currently being done.
// Download the page and collect its <script> elements.
String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL&guccounter=2";
String userAgent = "My UAString";
Document doc = Jsoup.connect(requestURL).userAgent(userAgent).get();
Elements scriptTags = doc.getElementsByTag("script");

// Captures whatever is assigned to root.App.main, up to the trailing "; }(this));".
String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
// Compiled once, outside the loop: the expression is identical for every script tag.
// DOTALL lets '.' match line terminators inside the script text.
Pattern pattern = Pattern.compile(re, Pattern.DOTALL);

// Remains null when no script tag contains the assignment.
String data = null;
for (Element script : scriptTags) {
    Matcher matcher = pattern.matcher(script.html());
    if (matcher.find()) {
        data = matcher.group(1);
        break; // only the first matching script is used
    }
}
// Downloads the Financials page, pulls the value assigned to root.App.main out of its
// script tags, parses it as JSON and logs the values of one income statement row.
String requestURL = "https://finance.yahoo.com/quote/AAPL/financials?p=AAPL";
String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.43";
// JSON key of the row to extract from each income statement entry.
String row = "totalRevenue";
try {
    Document doc = Jsoup.connect(requestURL).userAgent(userAgent).get();
    Elements scriptTags = doc.getElementsByTag("script");
    // Captures whatever is assigned to root.App.main, up to the trailing "; }(this));".
    String re = "root\\.App\\.main\\s*\\=\\s*(.*?);\\s*\\}\\(this\\)\\)\\s*;";
    // Compiled once, outside the loop: the expression is identical for every script tag.
    // DOTALL lets '.' match line terminators inside the script text.
    Pattern pattern = Pattern.compile(re, Pattern.DOTALL);
    for (Element script : scriptTags) {
        Matcher matcher = pattern.matcher(script.html());
        if (matcher.find()) {
            String data = matcher.group(1);
            //Log.d("data", data);
            JSONObject jo = new JSONObject(data);
            JSONArray table = getTable(jo);
            //Log.d("table", table.toString());
            String[] tableRow = getRow(table, row);
            String values = TextUtils.join(", ", tableRow);
            Log.d("values", values);
            // Stop at the first script that matches instead of scanning the rest.
            break;
        }
    }
} catch (Exception e) {
    // Network, parsing and JSON failures all end up here and are only logged.
    Log.e("err", "err", e);
}
/**
 * Navigates the parsed page state down to the array of income statement entries.
 *
 * <p>The path followed is
 * {@code context > dispatcher > stores > QuoteSummaryStore >
 * incomeStatementHistoryQuarterly > incomeStatementHistory}.
 *
 * @param json the parsed JSON object to navigate
 * @return the array stored under {@code incomeStatementHistory}
 * @throws JSONException if a step of the path cannot be resolved
 */
private JSONArray getTable(JSONObject json) throws JSONException {
    JSONObject stores = json.getJSONObject("context")
            .getJSONObject("dispatcher")
            .getJSONObject("stores");
    JSONObject quarterlyHistory = stores.getJSONObject("QuoteSummaryStore")
            .getJSONObject("incomeStatementHistoryQuarterly");
    return quarterlyHistory.getJSONArray("incomeStatementHistory");
}
/**
 * Collects one named field from every entry of the table.
 *
 * @param table array of JSON objects, one per column of the result
 * @param name  key of the field to read from each entry
 * @return one string per entry: the field's {@code longFmt} value, or {@code "-"} when the
 *         entry lacks the field or the field lacks {@code longFmt}
 * @throws JSONException if an entry or the named field is not a JSON object
 */
private String[] getRow(JSONArray table, String name) throws JSONException {
    final int columnCount = table.length();
    String[] values = new String[columnCount];
    for (int col = 0; col < columnCount; col++) {
        JSONObject entry = table.getJSONObject(col);
        // Placeholder used unless the entry provides a formatted value.
        String cell = "-";
        if (entry.has(name)) {
            JSONObject field = entry.getJSONObject(name);
            if (field.has("longFmt")) {
                cell = field.get("longFmt").toString();
            }
        }
        values[col] = cell;
    }
    return values;
}
/**
 * Reads the formatted end date of every entry of the table.
 *
 * @param table array of JSON objects, each holding an {@code endDate} object
 * @return the {@code endDate.fmt} value of each entry, in table order
 * @throws JSONException if an entry has no {@code endDate} object or that object has no
 *                       {@code fmt} value
 */
private String[] getDates(JSONArray table) throws JSONException {
    final int entryCount = table.length();
    String[] dates = new String[entryCount];
    int index = 0;
    while (index < entryCount) {
        JSONObject endDate = table.getJSONObject(index).getJSONObject("endDate");
        dates[index] = endDate.get("fmt").toString();
        index++;
    }
    return dates;
}
/**
 * Builds the lookup of table sections and their rows.
 *
 * <p>The outer map is keyed by section title; each inner map goes from a row's display label
 * to the JSON key under which that row's values are stored. Both levels keep insertion
 * order, and a fresh, modifiable set of maps is created on every call.
 *
 * @return section title mapped to (display label mapped to JSON key)
 */
Map<String, Map<String, String>> getTableNames() {
    // Plain maps filled with put() instead of double-brace initialization, which would
    // create an anonymous subclass per map, each holding a reference to the enclosing instance.
    Map<String, String> revenue = new LinkedHashMap<String, String>();
    revenue.put("Total Revenue", "totalRevenue");
    revenue.put("Cost of Revenue", "costOfRevenue");
    revenue.put("Gross Profit", "grossProfit");

    Map<String, String> operatingExpenses = new LinkedHashMap<String, String>();
    operatingExpenses.put("Research Development", "researchDevelopment");
    operatingExpenses.put("Selling General and Administrative", "sellingGeneralAdministrative");
    operatingExpenses.put("Non Recurring", "nonRecurring");
    operatingExpenses.put("Others", "otherOperatingExpenses");
    operatingExpenses.put("Total Operating Expenses", "totalOperatingExpenses");
    operatingExpenses.put("Operating Income or Loss", "operatingIncome");

    Map<String, Map<String, String>> allTableNames =
            new LinkedHashMap<String, Map<String, String>>();
    allTableNames.put("Revenue", revenue);
    allTableNames.put("Operating Expenses", operatingExpenses);
    return allTableNames;
}
// Parse the extracted JSON text and locate the income statement entries.
JSONObject jo = new JSONObject(jsData);
JSONArray table = getTable(jo);
// Translate the display label into the JSON key used inside each entry.
Map<String, Map<String, String>> tableNames = getTableNames();
String totalRevenueKey = tableNames.get("Revenue").get("Total Revenue");
String[] totalRevenueValues = getRow(table, totalRevenueKey);
// An empty table produces an empty array; use the "-" placeholder that getRow uses for
// missing values instead of failing with ArrayIndexOutOfBoundsException.
String value = totalRevenueValues.length > 0 ? totalRevenueValues[0] : "-";
// Flattens the whole table into one list. For each section: the section title, then the
// end dates, then for every row its display label followed by its values.
List<String> tableData = new ArrayList<>();
Map<String, Map<String, String>> tableNames = getTableNames();
String[] dates = getDates(table);
for (Map.Entry<String, Map<String, String>> section : tableNames.entrySet()) {
    tableData.add(section.getKey());
    tableData.addAll(Arrays.asList(dates));
    Map<String, String> rows = section.getValue();
    for (Map.Entry<String, String> labelToKey : rows.entrySet()) {
        // Read the row before appending its label, so a failure leaves no dangling label.
        String[] cells = getRow(table, labelToKey.getValue());
        tableData.add(labelToKey.getKey());
        tableData.addAll(Arrays.asList(cells));
    }
}
// Single comma-separated string of everything collected above.
String tableDataString = TextUtils.join(", ", tableData);