Я предполагаю, что BeautifulSoup.prettify добавляет эти лишние / ненужные переводы строк по умолчанию, и, похоже, нет хорошего способа изменить это поведение.
ДА
Это делается в двух методах класса bs4.Tag: decode и decode_contents.
Ссылка: Исходный файл на github
Если вам нужно лишь временное исправление, вы можете подменить эти два метода с помощью monkey-patching («обезьяньего патча»).
Вот моя реализация
from bs4 import Tag, NavigableString, BeautifulSoup
from bs4.element import AttributeValueWithCharsetSubstitution, EntitySubstitution
def decode(
    self, indent_level=None,
    eventual_encoding="utf-8", formatter="minimal"
):
    """Render this tag, its attributes and its contents as a string.

    Intended as a drop-in replacement for ``bs4.Tag.decode``. When
    pretty-printing, an element that has no child tags is emitted on a
    single line (``<name>text</name>``); an element that has child tags
    gets its children on separate lines with the closing tag on a line
    of its own.

    Args:
        indent_level: ``None`` disables pretty-printing. Otherwise the
            opening tag is indented by ``indent_level - 1`` spaces and
            the contents are rendered at ``indent_level + 1``.
        eventual_encoding: Passed to ``encode`` on attribute values that
            are ``AttributeValueWithCharsetSubstitution`` instances, and
            forwarded to ``decode_contents``.
        formatter: A callable, or a name that is resolved once through
            ``self._formatter_for_name``.

    Returns:
        The rendered markup as a ``str``.
    """
    # Resolve a formatter name once so nested calls do not repeat the lookup.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                # Valueless attribute: emit the bare name.
                decoded = key
            else:
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                elif not isinstance(val, str):
                    val = str(val)
                elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None
                ):
                    val = val.encode(eventual_encoding)
                text = self.format_string(val, formatter)
                decoded = (
                    str(key) + '='
                    + EntitySubstitution.quoted_attribute_value(text))
            attrs.append(decoded)

    close = ''
    closeTag = ''
    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"
    if self.is_empty_element:
        close = '/'
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    space = ''
    indent_space = ''
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object: it has no markup of its own.
        return contents

    s = []
    attribute_string = ''
    if attrs:
        attribute_string = ' ' + ' '.join(attrs)
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        s.append(indent_space)
    s.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))

    # Only elements that contain other tags are spread over several lines.
    has_tag_child = bool(pretty_print) and any(
        isinstance(item, Tag) for item in self.children)

    if has_tag_child:
        s.append("\n")
        s.append(contents)
        # The last child only ends with a newline when it has a following
        # sibling (e.g. a whitespace text node); add one otherwise so the
        # closing tag always starts on its own line.
        if contents and contents[-1] != "\n":
            s.append("\n")
        if closeTag:
            s.append(space)
    elif pretty_print:
        # Text-only element: drop the indentation that decode_contents
        # added so the text sits directly between the tags.
        s.append(contents.strip())
    else:
        # Not pretty-printing: whitespace is significant, keep it verbatim.
        s.append(contents)
    s.append(closeTag)

    if indent_level is not None and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        s.append("\n")
    return ''.join(s)
def decode_contents(
    self,
    indent_level=None,
    eventual_encoding="utf-8",
    formatter="minimal"
):
    """Render the children of this tag as a single string.

    Intended as a drop-in replacement for ``bs4.Tag.decode_contents``.
    Unlike a line-per-node layout, no newline is appended after a text
    node, so a text-only element can be kept on one line by the caller.

    Args:
        indent_level: ``None`` disables pretty-printing. Otherwise each
            non-empty text node is stripped and prefixed with
            ``indent_level - 1`` spaces, except inside a ``pre`` tag.
        eventual_encoding: Forwarded unchanged to each child tag's
            ``decode``.
        formatter: A callable, or a name that is resolved once through
            ``self._formatter_for_name``.

    Returns:
        The concatenated output of all children as a ``str``.
    """
    # First off, turn a string formatter into a function. This
    # will stop the lookup from happening over and over again.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    pretty_print = indent_level is not None
    # Text directly inside <pre> is emitted untouched.
    keeps_whitespace = self.name == 'pre'

    pieces = []
    for child in self:
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
            if text and indent_level and not keeps_whitespace:
                text = text.strip()
            if text:
                if pretty_print and not keeps_whitespace:
                    pieces.append(" " * (indent_level - 1))
                pieces.append(text)
        elif isinstance(child, Tag):
            pieces.append(
                child.decode(indent_level, eventual_encoding, formatter)
            )
    return ''.join(pieces)
# Monkey-patch bs4.Tag so that every tag is rendered by the two
# replacement functions defined above.
Tag.decode, Tag.decode_contents = decode, decode_contents
После этого, когда я выполнил print(soup.prettify()), вывод был таким:
<annotation>
 <folder>Definitiva</folder>
 <filename>armas_229.jpg</filename>
 <path>/tmp/tmpygedczp5/handgun/images/armas_229.jpg</path>
 <size>
  <width>1800</width>
  <height>1426</height>
  <depth>3</depth>
 </size>
 <segmented>0</segmented>
 <object>
  <name>handgun</name>
  <pose>Unspecified</pose>
  <truncated>0</truncated>
  <difficult>0</difficult>
  <bndbox>
   <xmin>1001</xmin>
   <ymin>549</ymin>
   <xmax>1453</xmax>
   <ymax>1147</ymax>
  </bndbox>
 </object>
</annotation>
Я сделал много предположений, делая это. Просто хотел показать, что это возможно.