#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
namespace newsbeuter {
/**
 * Creates a renderer and registers every HTML tag name it recognises.
 *
 * @param width stored in `w`; render() wraps text at this width and uses it
 *              for the length of horizontal rules
 * @param raw   stored in `raw_`; render() skips its inline markup when set
 */
htmlrenderer::htmlrenderer(unsigned int width, bool raw) : w(width), raw_(raw) {
	// Names are lowercase because render() lowercases a tag name before
	// looking it up. Some names deliberately share one tag id
	// (aside/blockquote, b/strong).
	struct tag_entry {
		const char * name;
		htmltag id;
	};
	static const tag_entry known_tags[] = {
		{ "a", TAG_A },
		{ "embed", TAG_EMBED },
		{ "br", TAG_BR },
		{ "pre", TAG_PRE },
		{ "ituneshack", TAG_ITUNESHACK },
		{ "img", TAG_IMG },
		{ "blockquote", TAG_BLOCKQUOTE },
		{ "aside", TAG_BLOCKQUOTE },
		{ "p", TAG_P },
		{ "h1", TAG_H1 },
		{ "h2", TAG_H2 },
		{ "h3", TAG_H3 },
		{ "h4", TAG_H4 },
		{ "ol", TAG_OL },
		{ "ul", TAG_UL },
		{ "li", TAG_LI },
		{ "dt", TAG_DT },
		{ "dd", TAG_DD },
		{ "dl", TAG_DL },
		{ "sup", TAG_SUP },
		{ "sub", TAG_SUB },
		{ "hr", TAG_HR },
		{ "b", TAG_STRONG },
		{ "strong", TAG_STRONG },
		{ "u", TAG_UNDERLINE },
		{ "q", TAG_QUOTATION },
		{ "script", TAG_SCRIPT },
		{ "style", TAG_STYLE },
		{ "table", TAG_TABLE },
		{ "th", TAG_TH },
		{ "tr", TAG_TR },
		{ "td", TAG_TD },
	};
	for (const tag_entry& entry : known_tags) {
		tags[entry.name] = entry.id;
	}
}
void htmlrenderer::render(const std::string& source, std::vector& lines, std::vector& links, const std::string& url) {
std::istringstream input(source);
render(input, lines, links, url);
}
unsigned int htmlrenderer::add_link(std::vector& links, const std::string& link, link_type type) {
bool found = false;
unsigned int i=1;
for (auto l : links) {
if (l.first == link) {
found = true;
break;
}
i++;
}
if (!found)
links.push_back(linkpair(link,type));
return i;
}
/**
 * Renders HTML read from `input` into plain text lines.
 *
 * The stream is tokenised with tagsouppullparser. Start tags, end tags and
 * text events update the line under construction (`curline`), the
 * indentation level, the ordered-list counters and the stack of open tables.
 * Finished lines are handed to add_line()/add_nonempty_line(), which store
 * them either in `lines` or in the innermost open table. Link, image and
 * flash-embed targets are passed through utils::absolute_url(url, ...) and
 * utils::censor_url() and registered in `links` via add_link().
 *
 * After the document ends, the pending line is flushed, tables that are
 * still open are rendered, and a "Links: " section is started if any link
 * was collected.
 *
 * NOTE(review): in this view the template arguments of std::vector and
 * static_cast are missing, and the end of this function is fused with the
 * signature of the following one (see the note near the end) - confirm
 * against the original file before editing.
 */
void htmlrenderer::render(std::istream& input, std::vector& lines, std::vector& links, const std::string& url) {
// NOTE(review): image_count is incremented for every image but never read in
// the visible part of this function.
unsigned int image_count = 0;
std::string curline;
int indent_level = 0;
bool inside_li = false, is_ol = false, inside_pre = false;
bool itunes_hack = false;
// nesting depth of <script> / <style>; text is skipped while either is non-zero
size_t inside_script = 0;
size_t inside_style = 0;
// one counter and one numbering type per currently open <ol>
std::vector ol_counts;
std::vector ol_types;
htmltag current_tag;
// number of the link opened by the current <a>, or -1 if there is none
int link_num = -1;
// stack of currently open (possibly nested) tables; back() is the innermost
std::vector tables;
/*
* to render the HTML, we use a self-developed "XML" pull parser.
*
* A pull parser works like this:
* - we feed it with an XML stream
* - we then gather an iterator
* - we then can iterate over all continuous elements, such as start tag, close tag, text element, ...
*/
tagsouppullparser xpp;
xpp.setInput(input);
for (tagsouppullparser::event e = xpp.next(); e != tagsouppullparser::END_DOCUMENT; e = xpp.next()) {
std::string tagname;
switch (e) {
case tagsouppullparser::START_TAG:
tagname = xpp.getText();
// tag names are matched case-insensitively
std::transform(tagname.begin(), tagname.end(), tagname.begin(), ::tolower);
// NOTE(review): if `tags` is a std::map, operator[] inserts a default entry
// for every unknown tag name - confirm whether a find() would be preferable.
current_tag = tags[tagname];
switch (current_tag) {
case TAG_A: {
std::string link;
try {
link = xpp.getAttributeValue("href");
} catch (const std::invalid_argument& ) {
LOG(LOG_WARN,"htmlrenderer::render: found a tag with no href attribute");
link = "";
}
if (link.length() > 0) {
link_num = add_link(links,utils::censor_url(utils::absolute_url(url,link)), LINK_HREF);
// NOTE(review): this appends an empty string (a no-op) while the matching
// END_TAG branch appends ">" - the markup text looks incomplete here and in
// the TAG_STRONG / TAG_UNDERLINE / TAG_TH branches; confirm.
if (!raw_)
curline.append("");
}
}
break;
case TAG_STRONG:
if (!raw_)
curline.append("");
break;
case TAG_UNDERLINE:
if (!raw_)
curline.append("");
break;
case TAG_QUOTATION:
if (!raw_)
curline.append("\"");
break;
case TAG_EMBED: {
std::string type;
try {
type = xpp.getAttributeValue("type");
} catch (const std::invalid_argument& ) {
LOG(LOG_WARN, "htmlrenderer::render: found embed object without type attribute");
type = "";
}
// only flash embeds are turned into a link placeholder
if (type == "application/x-shockwave-flash") {
std::string link;
try {
link = xpp.getAttributeValue("src");
} catch (const std::invalid_argument& ) {
LOG(LOG_WARN, "htmlrenderer::render: found embed object without src attribute");
link = "";
}
if (link.length() > 0) {
link_num = add_link(links,utils::censor_url(utils::absolute_url(url,link)), LINK_EMBED);
curline.append(utils::strprintf("[%s %u]", _("embedded flash:"), link_num));
}
}
}
break;
case TAG_BR:
// <br> always ends the line, even an empty one
add_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_PRE:
inside_pre = true;
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_ITUNESHACK:
itunes_hack = true;
break;
case TAG_IMG: {
std::string imgurl;
std::string imgtitle;
try {
imgurl = xpp.getAttributeValue("src");
} catch (const std::invalid_argument& ) {
LOG(LOG_WARN,"htmlrenderer::render: found img tag with no src attribute");
imgurl = "";
}
try {
imgtitle = xpp.getAttributeValue("title");
} catch (const std::invalid_argument& ) {
imgtitle = "";
}
if (imgurl.length() > 0) {
// data: URLs are listed under a fixed label instead of their full content
if (imgurl.substr(0,5) == "data:") {
link_num = add_link(links, "inline image", LINK_IMG);
} else {
link_num = add_link(links,utils::censor_url(utils::absolute_url(url,imgurl)), LINK_IMG);
}
if (imgtitle != "") {
curline.append(utils::strprintf("[%s %u: %s]", _("image"), link_num, imgtitle.c_str()));
} else {
curline.append(utils::strprintf("[%s %u]", _("image"), link_num));
}
image_count++;
}
}
break;
case TAG_BLOCKQUOTE:
++indent_level;
add_nonempty_line(curline, tables, lines);
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_H1:
case TAG_H2:
case TAG_H3:
case TAG_H4:
case TAG_P:
add_nonempty_line(curline, tables, lines);
// add a separating blank line only if the previous line is longer than the
// current indentation
// NOTE(review): the target type of static_cast is missing in this view.
if (lines.size() > 0 && lines[lines.size()-1].length() > static_cast(indent_level*2))
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_OL:
is_ol = true;
{
unsigned int ol_count = 1;
try {
std::string ol_count_str = xpp.getAttributeValue("start");
std::istringstream is(ol_count_str);
// NOTE(review): a failed extraction does not throw, so a non-numeric start
// value leaves ol_count at whatever operator>> stores on failure (0 since
// C++11) rather than 1 - confirm this is intended.
is >> ol_count;
} catch (const std::invalid_argument& ) {
ol_count = 1;
}
ol_counts.push_back(ol_count);
std::string ol_type;
try {
ol_type = xpp.getAttributeValue("type");
// anything other than the five known numbering types falls back to "1"
if (ol_type != "1" && ol_type != "a" && ol_type != "A" && ol_type != "i" && ol_type != "I") {
ol_type = "1";
}
} catch (const std::invalid_argument& ) {
ol_type = "1";
}
ol_types.push_back(ol_type[0]);
}
add_nonempty_line(curline, tables, lines);
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_UL:
is_ol = false;
add_nonempty_line(curline, tables, lines);
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_LI:
// a new <li> while one is still open implicitly closes the previous item
if (inside_li) {
indent_level-=2;
if (indent_level < 0) indent_level = 0;
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
}
inside_li = true;
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
indent_level+=2;
// numbered marker for ordered lists, bullet otherwise
if (is_ol && ol_counts.size() != 0) {
curline.append(utils::strprintf("%s.", format_ol_count(ol_counts[ol_counts.size()-1], ol_types[ol_types.size()-1]).c_str()));
++ol_counts[ol_counts.size()-1];
} else {
curline.append(" * ");
}
break;
case TAG_DT:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_DD:
indent_level+=4;
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_DL:
// ignore tag
break;
case TAG_SUP:
curline.append("^");
break;
case TAG_SUB:
curline.append("[");
break;
case TAG_HR:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
// NOTE(review): `w` is unsigned, so `w - 2` wraps around for w < 2 and the
// std::string constructor is then asked for a huge length - confirm that
// callers never pass such a width.
add_line(std::string(" ") + std::string(w - 2, '-') + std::string(" "), tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_SCRIPT:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
// don't render scripts, ignore current line
inside_script++;
break;
case TAG_STYLE:
inside_style++;
break;
case TAG_TABLE: {
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, 0); // no indent in tables
bool border = false;
try {
std::string b = xpp.getAttributeValue("border");
border = (utils::to_u(b) > 0);
} catch (const std::invalid_argument& ) {
// that's fine: no border then
}
tables.push_back(Table(border));
break;
}
case TAG_TR:
if (!tables.empty())
tables.back().start_row();
break;
case TAG_TH: {
size_t span = 1;
try {
span = utils::to_u(xpp.getAttributeValue("colspan"));
} catch (const std::invalid_argument& ) {
// that's fine: span 1 then
}
if (!tables.empty())
tables.back().start_cell(span);
curline.append("");
break;
}
case TAG_TD: {
size_t span = 1;
try {
span = utils::to_u(xpp.getAttributeValue("colspan"));
} catch (const std::invalid_argument& ) {
// that's fine: span 1 then
}
if (!tables.empty())
tables.back().start_cell(span);
break;
}
}
break;
case tagsouppullparser::END_TAG:
tagname = xpp.getText();
std::transform(tagname.begin(), tagname.end(), tagname.begin(), ::tolower);
current_tag = tags[tagname];
switch (current_tag) {
case TAG_BLOCKQUOTE:
--indent_level;
if (indent_level < 0) indent_level = 0;
add_nonempty_line(curline, tables, lines);
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_OL:
// NOTE(review): entries are only pushed by an opening <ol>, so a stray
// </ol> reaches pop_back() on an empty vector, which is undefined
// behaviour - consider guarding with empty().
ol_types.pop_back();
ol_counts.pop_back();
// fall-through
case TAG_UL:
// closing the list also closes a list item that is still open
if (inside_li) {
indent_level-=2;
if (indent_level < 0) indent_level = 0;
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
}
add_nonempty_line(curline, tables, lines);
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_DT:
add_nonempty_line(curline, tables, lines);
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_DD:
indent_level-=4;
if (indent_level < 0) indent_level = 0;
add_nonempty_line(curline, tables, lines);
add_line("", tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_DL:
// ignore tag
break;
case TAG_LI:
indent_level-=2;
if (indent_level < 0) indent_level = 0;
inside_li = false;
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_H1:
// a non-empty <h1> is underlined with dashes as wide as the heading line
if (line_is_nonempty(curline)) {
add_line(curline, tables, lines);
size_t llen = utils::strwidth_stfl(curline);
prepare_newline(curline, tables.size() ? 0 : indent_level);
add_line(std::string(llen, '-'), tables, lines);
}
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_H2:
case TAG_H3:
case TAG_H4:
case TAG_P:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_PRE:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
inside_pre = false;
break;
case TAG_SUB:
curline.append("]");
break;
case TAG_SUP:
// has closing tag, but we render nothing.
break;
case TAG_A:
// only links that were actually registered get a "[n]" reference
if (link_num != -1) {
if (!raw_)
curline.append(">");
curline.append(utils::strprintf("[%d]", link_num));
link_num = -1;
}
break;
case TAG_UNDERLINE:
if (!raw_)
curline.append(">");
break;
case TAG_STRONG:
if (!raw_)
curline.append(">");
break;
case TAG_QUOTATION:
if (!raw_)
curline.append("\"");
break;
case TAG_EMBED:
case TAG_BR:
case TAG_ITUNESHACK:
case TAG_IMG:
case TAG_HR:
// ignore closing tags
break;
case TAG_SCRIPT:
// don't render scripts, ignore current line
if (inside_script)
inside_script--;
prepare_newline(curline, tables.size() ? 0 : indent_level);
break;
case TAG_STYLE:
if (inside_style)
inside_style--;
break;
case TAG_TABLE:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, 0); // no indent in tables
if (!tables.empty()) {
std::vector table_text;
// close whatever cell/row is still open before rendering
tables.back().complete_cell();
tables.back().complete_row();
render_table(tables.back(), table_text);
tables.pop_back();
if (!tables.empty()) { // still a table on the outside?
for(size_t idx=0; idx < table_text.size(); ++idx)
tables.back().add_text(table_text[idx]); // add rendered table to current cell
} else {
for(size_t idx=0; idx < table_text.size(); ++idx) {
std::string s = table_text[idx];
// strip leading newlines from each rendered table line
while (s.length() > 0 && s[0] == '\n')
s.erase(0, 1);
add_line(s, tables, lines);
}
}
}
prepare_newline(curline, tables.size() ? 0: indent_level);
break;
case TAG_TR:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, 0); // no indent in tables
if (!tables.empty())
tables.back().complete_row();
break;
case TAG_TH:
if (!tables.empty()) {
curline.append(">");
}
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, 0); // no indent in tables
if (!tables.empty()) {
tables.back().complete_cell();
}
break;
case TAG_TD:
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, 0); // no indent in tables
if (!tables.empty())
tables.back().complete_cell();
break;
}
break;
case tagsouppullparser::TEXT: {
if (itunes_hack) {
// itunes mode: newlines in the text are kept as line breaks, and the text
// between them is wrapped at width `w`
std::vector words = utils::tokenize_nl(utils::quote_for_stfl(xpp.getText()));
for (auto word : words) {
if (word == "\n") {
add_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
} else {
std::vector words2 = utils::tokenize_spaced(word);
// NOTE(review): `i` is incremented below but never read.
unsigned int i=0;
bool new_line = false;
for (auto word2 : words2) {
if ((utils::strwidth_stfl(curline) + utils::strwidth_stfl(word2)) >= w) {
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
new_line = true;
}
// a single-space token is dropped when it would start a wrapped line
if (new_line) {
if (word2 != " ")
curline.append(word2);
new_line = false;
} else {
curline.append(word2);
}
i++;
}
}
}
} else if (inside_pre) {
// preformatted text: keep line breaks, never wrap
std::vector words = utils::tokenize_nl(utils::quote_for_stfl(xpp.getText()));
for (auto word : words) {
if (word == "\n") {
add_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
} else {
curline.append(word);
}
}
} else if (inside_script || inside_style) {
// skip scripts and CSS styles
} else {
// regular text: drop leading newlines, then wrap at width `w`
std::string s = utils::quote_for_stfl(xpp.getText());
while (s.length() > 0 && s[0] == '\n')
s.erase(0, 1);
std::vector words = utils::tokenize_spaced(s);
bool new_line = false;
// do not start an otherwise empty line with a single-space token
if (!line_is_nonempty(curline) && !words.empty() && words[0] == " ") {
words.erase(words.begin());
}
for (auto word : words) {
if ((utils::strwidth_stfl(curline) + utils::strwidth_stfl(word)) >= w) {
add_nonempty_line(curline, tables, lines);
prepare_newline(curline, tables.size() ? 0 : indent_level);
new_line = true;
}
if (new_line) {
if (word != " ")
curline.append(word);
new_line = false;
} else {
curline.append(word);
}
}
}
}
break;
default:
/* do nothing */
break;
}
}
// and the rest
add_nonempty_line(curline, tables, lines);
// force all tables to be closed and rendered
while(!tables.empty()) {
std::vector table_text;
render_table(tables.back(), table_text);
tables.pop_back();
for(size_t idx=0; idx < table_text.size(); ++idx) {
std::string s = table_text[idx];
while (s.length() > 0 && s[0] == '\n')
s.erase(0, 1);
add_line(s, tables, lines);
}
}
// add link list
if (links.size() > 0) {
lines.push_back("");
lines.push_back(_("Links: "));
// NOTE(review): the next line is truncated in this view. The loop header
// that belongs to render() runs straight into the parameter list of a
// different function, so the end of render() and everything up to that
// function's signature are not visible here. Judging by its body and by the
// calls above, the function that follows is add_nonempty_line(), which
// forwards `curline` to add_line() only if line_is_nonempty() accepts it -
// confirm against the original file.
for (unsigned int i=0; i& tables, std::vector& lines) {
if (line_is_nonempty(curline))
add_line(curline, tables, lines);
}
void htmlrenderer::add_line(const std::string& curline, std::vector& tables, std::vector& lines) {
if (tables.size())
tables.back().add_text(curline);
else
lines.push_back(curline);
}
/**
 * Resets `line` to an empty line that is indented for the given level.
 *
 * Each indentation level is two spaces wide. A negative level is treated as
 * zero; without that guard it would be converted to a huge unsigned count
 * and the string would try to grow to that size.
 *
 * @param line         string to overwrite
 * @param indent_level number of indentation levels
 */
void htmlrenderer::prepare_newline(std::string& line, int indent_level) {
	const std::string::size_type depth =
		indent_level > 0 ? static_cast<std::string::size_type>(indent_level) : 0;
	line.assign(depth * 2, ' ');
}
/*
 * NOTE(review): two definitions are fused in this view. The loop header of
 * line_is_nonempty() runs straight into the parameter list of a second
 * function, so the rest of line_is_nonempty() and the start of the second
 * signature are not visible here. The body below works on a `table` and an
 * output vector `lines`; it is presumably render_table(), which render()
 * calls with exactly such a pair - confirm against the original file.
 *
 * What the visible body does: it turns the rows and cells stored in `table`
 * into text lines appended to `lines`. Columns are padded to a common
 * width; if table.border is set, '|' separates cells, and '+'/'-' separator
 * lines frame the rows; without a border, cells are separated by a space.
 */
bool htmlrenderer::line_is_nonempty(const std::string& line) {
for (unsigned int i=0; i& lines) {
// get number of rows
size_t rows = table.rows.size();
// get maximum number of cells
// (a cell counts as many columns as its span says)
size_t cells = 0;
for(size_t row=0; row < rows; row++) {
size_t count = 0;
for(size_t cell=0; cell < table.rows[row].cells.size(); cell++) {
count += table.rows[row].cells[cell].span;
}
cells = std::max(cells, count);
}
// get the width of each column
std::vector cell_widths;
cell_widths.resize(cells, 0);
for(size_t row=0; row < rows; row++) {
for(size_t cell=0; cell < table.rows[row].cells.size(); cell++) {
// widest text line of this cell
size_t width = 0;
if (table.rows[row].cells[cell].text.size()) {
for(size_t idx=0; idx < table.rows[row].cells[cell].text.size(); idx++)
width = std::max(width, utils::strwidth_stfl(table.rows[row].cells[cell].text[idx]));
}
if (table.rows[row].cells[cell].span > 1) {
width += table.rows[row].cells[cell].span;
width /= table.rows[row].cells[cell].span; // divide size evenly across columns (can be done better, I know)
}
// NOTE(review): the width is recorded under the cell's position in the row,
// not under the column it starts at, so cells that follow a spanning cell
// update an earlier column. Also, if a span of 0 can occur, `cells` may be
// smaller than the number of cells in a row and this index would be out of
// range - confirm how spans are validated.
cell_widths[cell] = std::max(cell_widths[cell], width);
}
}
char hsep = '-';
char vsep = '|';
char hvsep = '+';
// create a row separator
std::string separator;
if (table.border)
separator += hvsep;
for(size_t cell=0; cell < cells; cell++) {
separator += std::string(cell_widths[cell], hsep);
separator += hvsep;
}
// without a border, cells are separated by a blank instead of '|'
if (!table.border)
vsep = ' ';
// render the table
if (table.border)
lines.push_back(separator);
for(size_t row=0; row < rows; row++) {
// calc height of this row
// (the largest number of text lines in any of its cells)
size_t height = 0;
for(size_t cell=0; cell < table.rows[row].cells.size(); cell++)
height = std::max(height, table.rows[row].cells[cell].text.size());
// emit the row one text line at a time
for(size_t idx=0; idx < height; ++idx) {
std::string line;
if (table.border)
line += vsep;
for(size_t cell=0; cell < table.rows[row].cells.size(); cell++) {
// cells with fewer text lines than the row height contribute only padding
size_t cell_width = 0;
if (idx < table.rows[row].cells[cell].text.size()) {
// NOTE(review): row and cell are size_t but are formatted with %d here
// (and cell_width / reference_width below) - confirm how LOG handles its
// format arguments.
LOG(LOG_DEBUG, "row = %d cell = %d text = %s", row, cell, table.rows[row].cells[cell].text[idx].c_str());
cell_width = utils::strwidth_stfl(table.rows[row].cells[cell].text[idx]);
line += table.rows[row].cells[cell].text[idx];
}
// a spanning cell also covers the following columns plus one separator each
size_t reference_width = cell_widths[cell];
if (table.rows[row].cells[cell].span > 1) {
for(size_t ic=cell+1; ic < cell + table.rows[row].cells[cell].span; ++ic)
reference_width += cell_widths[ic]+1;
}
LOG(LOG_DEBUG, "cell_width = %d reference_width = %d", cell_width, reference_width);
if (cell_width < reference_width) // pad, if necessary
line += std::string(reference_width - cell_width, ' ');
// separator between cells, but not after the last one
if (cell < table.rows[row].cells.size()-1)
line += vsep;
}
if (table.border)
line += vsep;
lines.push_back(line);
}
if (table.border)
lines.push_back(separator);
}
}
/**
 * Converts a list counter into lowercase alphabetic numbering:
 * 1 -> "a", 26 -> "z", 27 -> "aa", 28 -> "ab", ...
 *
 * There is no letter for zero, so 0 is returned as the decimal string "0".
 * Without that guard the initial decrement would wrap the unsigned counter
 * around and produce a meaningless label.
 *
 * @param count counter value of the list item
 * @return the alphabetic label, or "0" if count is 0
 */
std::string htmlrenderer::get_char_numbering(unsigned int count) {
	if (count == 0) {
		return "0";
	}
	std::string result;
	do {
		// The system has no zero digit ("a" is 1), so shift by one before
		// extracting each letter.
		--count;
		result.push_back(static_cast<char>('a' + (count % 26)));
		count /= 26;
	} while (count > 0);
	// letters were produced least significant first
	std::reverse(result.begin(), result.end());
	return result;
}
std::string htmlrenderer::get_roman_numbering(unsigned int count) {
unsigned int values[] = { 1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1 };
const char * numerals[] = { "m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i" };
std::string result;
for (unsigned int i=0; i<(sizeof(values)/sizeof(values[0])); i++) {
while (count >= values[i]) {
count -= values[i];
result.append(numerals[i]);
}
}
return result;
}
/**
 * Formats the counter of an ordered-list item for the given numbering type.
 *
 * 'a' / 'A' select alphabetic numbering and 'i' / 'I' roman numbering, in
 * lower or upper case respectively. Every other type, including '1',
 * yields the decimal number formatted with "%2u".
 *
 * @param count counter value of the list item
 * @param type  numbering type character
 * @return the formatted counter, without a trailing dot
 */
std::string htmlrenderer::format_ol_count(unsigned int count, char type) {
	const bool alphabetic = (type == 'a' || type == 'A');
	const bool roman = (type == 'i' || type == 'I');
	if (!alphabetic && !roman) {
		return utils::strprintf("%2u", count);
	}

	std::string label = alphabetic ? get_char_numbering(count) : get_roman_numbering(count);
	// The helpers produce lowercase; the capital type letters ask for the
	// uppercase variant.
	if (type == 'A' || type == 'I') {
		std::transform(label.begin(), label.end(), label.begin(), ::toupper);
	}
	return label;
}
}