// Shorthand for the Spirit.Qi namespace used by the grammar below.
namespace qi = boost::spirit::qi;
// Radix-16 integer parser that requires exactly two digits (min 2, max 2) and
// yields an unsigned char; the grammar uses it after "%" for percent-decoding.
qi::int_parser<unsigned char, 16, 2, 2> hex_byte;
// Spirit.Qi grammar that parses a URL string into a UrlStruct:
// [protocol://][login:password@]host[:port]path
struct UrlParser : qi::grammar<stringIterator_t, UrlStruct()>
{
// Defines all rules; `start` is registered as the grammar's entry rule.
UrlParser() : UrlParser::base_type(start)
{
using boost::spirit::int_;
namespace ascii = boost::spirit::ascii;
using boost::spirit::ascii::char_;
// Parses http://, rtsp://, ftp://....
// One or more characters other than ':' followed by the literal "://".
protocol %= +(char_ - ':') >> "://";
// Parsers for url decoding
// "+" decodes to a space; "%" followed by hex_byte yields the decoded byte.
encodeChar.add("+", ' ');
// Password: decoded characters up to (not including) the first '@'.
// An unescaped '@' inside the password therefore ends the password early.
passwordRule = *((encodeChar | "%" >> hex_byte | char_) - '@');
// Login: decoded characters up to (not including) the first ':'.
loginRule = *((encodeChar | "%" >> hex_byte | char_) - ':');
// Parses user:pass@, user:@, :pass@
credentials %= loginRule >> ':' >> passwordRule >> '@';
// Parses host name or ip address in 192.168.0.1:80 192.168.0.1/script.cgi
// Each character is accepted unless the negative look-ahead sees either a '/'
// or a ':' followed by digits and then '/' or end of input.
host %= +(!(char_("/") | (char_(":")) >> +ascii::digit >> (char_("/") | boost::spirit::eoi)) >> char_);
// Parses a port number such as ":80".
port %= ':' >> int_;
// Parses script path in "/video.cgi?resolution=1" string.
// Consumes every character up to an end-of-line.
path %= *(char_ - boost::spirit::eol);
// Full URL: optional protocol, optional credentials, host, optional port, path.
start %= -protocol
>> -credentials
>> host
>> -port
>> path
;
}
// Rule objects; their definitions are assigned in the constructor above.
qi::rule<stringIterator_t, UrlStruct()> start;
qi::rule<stringIterator_t, std::string()> protocol;
qi::rule<stringIterator_t, UrlStruct::stringPair_t()> credentials;
qi::rule<stringIterator_t, std::string()> host;
qi::rule<stringIterator_t, int()> port;
qi::rule<stringIterator_t, std::string()> path;
private:
// Helper rules used only inside `credentials`.
qi::rule<stringIterator_t, std::string()> loginRule;
qi::rule<stringIterator_t, std::string()> passwordRule;
// Symbol table mapping encoded tokens to their decoded character.
qi::symbols<char const, char const> encodeChar;
};
UrlStruct looks like this:
// Parsed form of a URL. Members typed as boost::optional may be empty.
struct UrlStruct
{
    // Type aliases for the optional components.
    using optString_t   = boost::optional<std::string>;
    using port_t        = boost::optional<int>;
    using stringPair_t  = boost::fusion::vector<std::string, std::string>;
    using credentials_t = boost::optional<stringPair_t>;

    optString_t   protocol;     // scheme, if any
    credentials_t credentials;  // pair of strings, if any
    std::string   host;         // host name or address
    port_t        port;         // port number, if any
    std::string   path;         // remainder of the URL
};
and this works when I have a URL like this:
rtsp://admin:admin@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov
But when my password contains '@', parsing does not work. For example, with this URL:
rtsp://admin:adm@in@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov
How can I resolve this problem?
The relatively obvious workaround would be to URL-escape the `@`:
UrlParser p;
// Try each sample URL: a plain password, a percent-encoded '@' (%40),
// and a raw '@' inside the password.
for (std::string const input : {
        "rtsp://admin:admin@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov",
        "rtsp://admin:adm%40in@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov",
        "rtsp://admin:adm@in@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov",
    })
{
    stringIterator_t f = begin(input), l = end(input);
    Ast::UrlStruct u;
    // Report the parsed structure on success, or a failure marker otherwise.
    if (parse(f, l, p, u)) {
        std::cout << "Parsed: " << u << "\n";
    } else {
        std::cout << "Failed\n";
    }
}
Prints
Parsed: ( rtsp (admin admin) wowzaec2demo.streamlock.net -- /vod/mp4:BigBuckBunny_115k.mov)
Parsed: ( rtsp (admin adm@in) wowzaec2demo.streamlock.net -- /vod/mp4:BigBuckBunny_115k.mov)
Parsed: ( rtsp (admin adm) in@wowzaec2demo.streamlock.net -- /vod/mp4:BigBuckBunny_115k.mov)
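As you can see, the second input (using `%40` for `@`) parsed fine, while the third, with the raw `@`, cuts the password off at the first `@` and leaves `in@` in the host.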
Your host-rule already has some negative look-ahead assertions:
// One or more characters, each accepted only when the negative look-ahead
// !(...) does not match at the current position.
host = +(
!(char_("/") | (char_(":")) >> +ascii::digit >> (char_("/") | boost::spirit::eoi))
>> char_);
I'd simplify this to the equivalent:
// Any non-'/' character, except where a port followed by '/' or end of input begins.
host = +(~char_("/") - (port >> ('/' | qi::eoi)));
As an aside, I think the rule that ":nnnn" is not a port unless it is at the very end of the hostname might be wrong and potentially unsafe. I suppose you have it there so you can still accept IPv6 addresses? See also "What is the nicest way to parse this in C++?"
Similarly, your `password` rule has a subtractive parser:
// Decoded characters up to, but not including, the first '@'.
password = *((encodeChar | "%" >> hex_byte | char_) - '@');
What you want is close; you don't want to disallow `'@'` per se, but only if it is followed by a valid host production:
// login: decoded characters up to the first ':'.
login = *((encodeChar | "%" >> hex_byte | char_) - ':');
// password: decoded characters, stopping only at an '@' that is followed by
// a host and then end-of-host.
password = *((encodeChar | "%" >> hex_byte | char_) -
('@' >> host >> eoh));
I introduced a nifty shorthand for end-of-host:
// End-of-host: a '/' or end of input. qi::copy stores the expression by value
// so that it can safely be held in an `auto` variable.
auto eoh = qi::copy('/' | qi::eoi);
This is to remove the repetition with the `host` rule. Note that we now finish the job by making sure `@` doesn't occur in the host part (there are likely more characters to exclude according to the RFC, but this is the minimum to make your tests pass):
// Host characters exclude '@' and '/', and stop before a port spec that is
// followed by end-of-host.
host = +(~char_("@/") - (portSpec >> eoh));
//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted.hpp>
#include <boost/fusion/include/io.hpp>
#include <boost/optional/optional_io.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iomanip>
namespace qi = boost::spirit::qi;
namespace Ast {

// Parsed form of a URL. Members typed as boost::optional may be empty.
struct UrlStruct {
    // Login/password pair kept as a named struct, which keeps debug output simple.
    struct Credentials {
        std::string login;
        std::string password;
    };

    using optString_t   = boost::optional<std::string>;
    using port_t        = boost::optional<int>;
    using credentials_t = boost::optional<Credentials>;

    optString_t   protocol;
    credentials_t credentials;
    std::string   host;
    port_t        port;
    std::string   path;
};

// Bring Fusion's stream-output operator into this namespace.
using boost::fusion::operator<<;

} // namespace Ast
// Adapt both structs as Fusion sequences over the listed members, in the listed order.
BOOST_FUSION_ADAPT_STRUCT(Ast::UrlStruct::Credentials, login, password)
BOOST_FUSION_ADAPT_STRUCT(Ast::UrlStruct, protocol, credentials, host, port, path)
// Iterator type the grammar and its rules are instantiated for.
using stringIterator_t = std::string::const_iterator;
// Spirit.Qi grammar that parses a URL string into an Ast::UrlStruct:
// [protocol://][login:password@]host[:port]path
struct UrlParser : qi::grammar<stringIterator_t, Ast::UrlStruct()>
{
    UrlParser() : UrlParser::base_type(start)
    {
        using boost::spirit::ascii::char_;

        // URL decoding: "+" stands for a space.
        encodeChar.add("+", ' ');

        // The host part ends at a '/' or at end of input.
        auto hostEnd = qi::copy('/' | qi::eoi);
        // One decoded character: a symbol-table hit, a "%xx" escape, or the raw character.
        auto decoded = qi::copy(encodeChar | "%" >> hex_byte | char_);

        // Scheme prefix such as http://, rtsp://, ftp://
        protocol = +(char_ - ':') >> "://";

        // user:pass@, user:@, :pass@
        // The password may contain '@' as long as that '@' is not followed by
        // a host and then the end of the host part.
        login       = *(decoded - ':');
        password    = *(decoded - ('@' >> host >> hostEnd));
        credentials = login >> ':' >> password >> '@';

        // ":80" style port specification.
        portSpec = ':' >> port_number;

        // Host name or address, e.g. 192.168.0.1:80 or 192.168.0.1/script.cgi;
        // '@' and '/' are excluded and a trailing port spec is left unconsumed.
        host = +(~char_("@/") - (portSpec >> hostEnd));

        // Script path such as "/video.cgi?resolution=1": everything up to end-of-line.
        path = *(char_ - qi::eol);

        start = -protocol >> -credentials >> host >> -portSpec >> path;

        BOOST_SPIRIT_DEBUG_NODES((start)(protocol)(credentials)(host)(portSpec)(
            path)(login)(password))
    }

private:
    qi::rule<stringIterator_t, Ast::UrlStruct()> start;
    qi::rule<stringIterator_t, std::string()> protocol;
    qi::rule<stringIterator_t, Ast::UrlStruct::Credentials()> credentials;
    qi::rule<stringIterator_t, std::string()> host;
    qi::rule<stringIterator_t, int()> portSpec;
    qi::rule<stringIterator_t, std::string()> path;
    qi::rule<stringIterator_t, std::string()> login;
    qi::rule<stringIterator_t, std::string()> password;

    // Maps encoded tokens to their decoded character.
    qi::symbols<char const, char const> encodeChar;
    // Exactly two hex digits, yielding one byte.
    qi::uint_parser<uint8_t, 16, 2, 2> hex_byte;
    // Unsigned 16-bit decimal port number.
    qi::uint_parser<uint16_t> port_number;
};
int main() {
UrlParser p;
for (std::string const input : {
"rtsp://admin:admin@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov",
"rtsp://admin:adm%40in@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov",
"rtsp://admin:adm@in@wowzaec2demo.streamlock.net/vod/mp4:BigBuckBunny_115k.mov",
})
{
stringIterator_t f = begin(input), l = end(input);
Ast::UrlStruct u;
if (parse(f, l, p, u)) {
std::cout << "Yay: " << u << "\n";
} else {
std::cout << "Nay\n";
}
if (f != l) {
std::cout << "Remaining: " << std::quoted(std::string(f, l)) << "\n";
}
}
}
Now prints:
Yay: ( rtsp (admin admin) wowzaec2demo.streamlock.net -- /vod/mp4:BigBuckBunny_115k.mov)
Yay: ( rtsp (admin adm@in) wowzaec2demo.streamlock.net -- /vod/mp4:BigBuckBunny_115k.mov)
Yay: ( rtsp (admin adm@in) wowzaec2demo.streamlock.net -- /vod/mp4:BigBuckBunny_115k.mov)
Port really should be limited to positive integers in the 16 bit range.
Use BOOST_SPIRIT_DEBUG to debug your grammar more easily
Don't use implementation details (`fusion::vector`). Instead, just use `std::pair`, or indeed another adapted struct as I've done. That makes it much less painful to implement `operator<<` for quick debugging.
You have a slight inconsistency regarding end-of-input detection. In most places you use `eoi`, but then in `path` you accept input until `eol`. You probably want to make things more consistent, or decide what any input starting from the `eol` means.
Real-life URIs sometimes contain a login name but no password (and then also without the `:` before the `@`). I think your grammar might want to support that?
A more structural deficiency with the parser right now is efficiency. All the negative look-aheads might lead to more backtracking than desired. `- (':' >> port)` should be fine, but `- ('@' >> host)` might peek quite a few characters. Realistically, I do not think that should be a problem (unless, of course, you regularly have very long passwords with many `@` (at-sign) characters).
But on a serious note, I'd take that as a sign that the parser is trying to do too many things at the same level of abstraction. Things would be much easier if you divide and conquer: split `schema://HOSTSTUFF/PATHSTUFF` first, and then parse `HOSTSTUFF` in the way I linked earlier. The same goes for the path/query part, because there's a myriad of interesting things there that you really don't need to bother with when first separating the top-level limbs of the URI.