Introduction
The fs2-data-xml
module provides tools to parse XML data in a streaming manner.
Basic usage
To create a stream of XML events from an input stream, use the events
pipe in the fs2.data.xml
package.
import cats.effect._
import cats.effect.unsafe.implicits.global
import fs2._
import fs2.data.xml._
val input = """<a xmlns:ns="http://test.ns">
| <ns:b ns:a="attribute">text</ns:b>
|</a>
|<a>
| <b/>
| test entity resolution &amp; normalization
|</a>""".stripMargin
// input: String = """<a xmlns:ns="http://test.ns">
// <ns:b ns:a="attribute">text</ns:b>
// </a>
// <a>
// <b/>
// test entity resolution &amp; normalization
// </a>"""
val stream = Stream.emit(input).through(events[IO, String]())
// stream: Stream[[A]IO[A], XmlEvent] = Stream(..)
stream.compile.toList.unsafeRunSync()
// res0: List[XmlEvent] = List(
// StartDocument,
// StartTag(
// name = QName(prefix = None, local = "a"),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "xmlns"), local = "ns"),
// value = List(XmlString(s = "http://test.ns", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(prefix = Some(value = "ns"), local = "b"),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "ns"), local = "a"),
// value = List(XmlString(s = "attribute", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(s = "text", isCDATA = false),
// EndTag(name = QName(prefix = Some(value = "ns"), local = "b")),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// EndTag(name = QName(prefix = None, local = "a")),
// EndDocument,
// StartDocument,
// StartTag(
// name = QName(prefix = None, local = "a"),
// attributes = List(),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(prefix = None, local = "b"),
// attributes = List(),
// isEmpty = true
// ),
// EndTag(name = QName(prefix = None, local = "b")),
// XmlString(
// s = """
// test entity resolution """,
// isCDATA = false
// ),
// XmlEntityRef(name = "amp"),
// XmlString(
// s = """ normalization
// """,
// isCDATA = false
// ),
// EndTag(name = QName(prefix = None, local = "a")),
// EndDocument
// )
The pipe validates the XML structure while parsing. It reads all the XML elements in the input stream and emits events as they are available.
Resolvers
Namespaces can be resolved by using the namespaceResolver
pipe.
val nsResolved = stream.through(namespaceResolver[IO])
// nsResolved: Stream[[x]IO[x], XmlEvent] = Stream(..)
nsResolved.compile.toList.unsafeRunSync()
// res1: List[XmlEvent] = List(
// StartDocument,
// StartTag(
// name = QName(
// prefix = Some(value = "http://www.w3.org/XML/1998/namespace"),
// local = "a"
// ),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "xmlns"), local = "ns"),
// value = List(XmlString(s = "http://test.ns", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(prefix = Some(value = "http://test.ns"), local = "b"),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "http://test.ns"), local = "a"),
// value = List(XmlString(s = "attribute", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(s = "text", isCDATA = false),
// EndTag(name = QName(prefix = Some(value = "http://test.ns"), local = "b")),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// EndTag(
// name = QName(
// prefix = Some(value = "http://www.w3.org/XML/1998/namespace"),
// local = "a"
// )
// ),
// EndDocument,
// StartDocument,
// StartTag(
// name = QName(
// prefix = Some(value = "http://www.w3.org/XML/1998/namespace"),
// local = "a"
// ),
// attributes = List(),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(
// prefix = Some(value = "http://www.w3.org/XML/1998/namespace"),
// local = "b"
// ),
// attributes = List(),
// isEmpty = true
// ),
// EndTag(
// name = QName(
// prefix = Some(value = "http://www.w3.org/XML/1998/namespace"),
// local = "b"
// )
// ),
// XmlString(
// s = """
// test entity resolution """,
// isCDATA = false
// ),
// XmlEntityRef(name = "amp"),
// XmlString(
// s = """ normalization
// """,
// isCDATA = false
// ),
// EndTag(
// name = QName(
// prefix = Some(value = "http://www.w3.org/XML/1998/namespace"),
// local = "a"
// )
// ),
// EndDocument
// )
Using the referenceResolver
pipe, entity and character references can be resolved. By default the standard xmlEntities
mapping is used, but it can be replaced by any mapping you see fit.
val entityResolved = stream.through(referenceResolver[IO]())
// entityResolved: Stream[[x]IO[x], XmlEvent] = Stream(..)
entityResolved.compile.toList.unsafeRunSync()
// res2: List[XmlEvent] = List(
// StartDocument,
// StartTag(
// name = QName(prefix = None, local = "a"),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "xmlns"), local = "ns"),
// value = List(XmlString(s = "http://test.ns", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(prefix = Some(value = "ns"), local = "b"),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "ns"), local = "a"),
// value = List(XmlString(s = "attribute", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(s = "text", isCDATA = false),
// EndTag(name = QName(prefix = Some(value = "ns"), local = "b")),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// EndTag(name = QName(prefix = None, local = "a")),
// EndDocument,
// StartDocument,
// StartTag(
// name = QName(prefix = None, local = "a"),
// attributes = List(),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(prefix = None, local = "b"),
// attributes = List(),
// isEmpty = true
// ),
// EndTag(name = QName(prefix = None, local = "b")),
// XmlString(
// s = """
// test entity resolution """,
// isCDATA = false
// ),
// XmlString(s = "&", isCDATA = false),
// XmlString(
// s = """ normalization
// """,
// isCDATA = false
// ),
// EndTag(name = QName(prefix = None, local = "a")),
// EndDocument
// )
Normalization
Once entities and namespaces are resolved, the events might be numerous and can be normalized to avoid emitting too many of them. For instance, after reference resolution, consecutive text events can be merged. This is achieved by using the normalize
pipe.
val normalized = entityResolved.through(normalize)
// normalized: Stream[[x]IO[x], XmlEvent] = Stream(..)
normalized.compile.toList.unsafeRunSync()
// res3: List[XmlEvent] = List(
// StartDocument,
// StartTag(
// name = QName(prefix = None, local = "a"),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "xmlns"), local = "ns"),
// value = List(XmlString(s = "http://test.ns", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(prefix = Some(value = "ns"), local = "b"),
// attributes = List(
// Attr(
// name = QName(prefix = Some(value = "ns"), local = "a"),
// value = List(XmlString(s = "attribute", isCDATA = false))
// )
// ),
// isEmpty = false
// ),
// XmlString(s = "text", isCDATA = false),
// EndTag(name = QName(prefix = Some(value = "ns"), local = "b")),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// EndTag(name = QName(prefix = None, local = "a")),
// EndDocument,
// StartDocument,
// StartTag(
// name = QName(prefix = None, local = "a"),
// attributes = List(),
// isEmpty = false
// ),
// XmlString(
// s = """
// """,
// isCDATA = false
// ),
// StartTag(
// name = QName(prefix = None, local = "b"),
// attributes = List(),
// isEmpty = true
// ),
// EndTag(name = QName(prefix = None, local = "b")),
// XmlString(
// s = """
// test entity resolution & normalization
// """,
// isCDATA = false
// ),
// EndTag(name = QName(prefix = None, local = "a")),
// EndDocument
// )
DOM builder and eventifier
To handle XML DOM, you can use the types and pipes available in the fs2.data.xml.dom
package.
XML DOM can be built if you provide an implicit DocumentBuilder to the documents
pipe. The DocumentBuilder[Doc]
typeclass describes how an XML DOM of type Doc
is built from an XML event stream.
import dom._
trait SomeDocType
implicit val builder: DocumentBuilder[SomeDocType] = ???
stream.through(documents[IO, SomeDocType])
Conversely, the pipe transforming a stream of Doc
s into a stream of XML events is called eventify
and requires an implicit DocumentEventifier in scope.
import dom._
trait SomeDocType
implicit val builder: DocumentBuilder[SomeDocType] = ???
implicit val eventifier: DocumentEventifier[SomeDocType] = ???
stream.through(documents[IO, SomeDocType])
.through(eventify[IO, SomeDocType])
XML Renderers
Once you have an XML event stream and have selected and transformed what you need from it, you can write the resulting event stream to some storage. This can be achieved using renderers.
For instance, if you want to write the resulting XML stream to a file in raw form (i.e. without trying to format the nested tags and text), you can do:
import fs2.io.file.{Files, Flags, Path}
stream
.through(render.raw())
.through(text.utf8.encode)
.through(Files[IO].writeAll(Path("/some/path/to/file.xml"), Flags.Write))
.compile
.drain
There is also a pretty()
renderer, that indents inner tags and text by the given indent string.
If you are interested in the String rendering as a value, the library also provides Collector
s:
stream.compile.to(collector.raw()).unsafeRunSync()
// res7: String = """<a xmlns:ns="http://test.ns">
// <ns:b ns:a="attribute">text</ns:b>
// </a><a>
// <b/>
// test entity resolution &amp; normalization
// </a>"""
stream.compile.to(collector.pretty()).unsafeRunSync()
// res8: String = """<a xmlns:ns="http://test.ns">
// <ns:b ns:a="attribute">
// text
// </ns:b>
// </a>
// <a>
// <b />
// test entity resolution &amp; normalization
// </a>"""