using Microsoft.AspNetCore.Http; using Microsoft.Extensions.Primitives; using Microsoft.Extensions.DependencyInjection; using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using Ganss.XSS; using System.IO; using Markdig; using Markdig.Syntax; using Blog3000.Shared; using Microsoft.Extensions.Logging; namespace Blog3000.Server.MiddleWares { public class SearchBotHandler { /// /// Default no SEO is allowed. /// Two-Folded strategy: Prohibit via static robots.txt and http headers /// On valid bots indexing may be allowed. The synthetics index + blosts are created, which /// match the uri-scheme of the blazor-viewer on the client /// /// allow valid bots like bing+google /// allow a dev-test-box 'xx-testbot-xx' /// private readonly string blogName; private readonly bool allowIndexing; private readonly bool allowDevBot; private readonly ILogger logger; private readonly SEOMgr seoMgr; public SearchBotHandler(ILogger logger, BlogEnv blogEnv, SEOMgr seoMgr) { this.logger = logger; this.seoMgr = seoMgr; this.blogName = blogEnv?.BlogConfig?.Title ?? "Blog3000 Instance"; this.allowIndexing = blogEnv.BlogConfig.AllowSearchBots; this.allowDevBot = blogEnv.BlogConfig.AllowSearchDevBot; } public async Task Process(HttpContext context) { string robotsTag = "noindex, nofollow"; StringValues uas; bool isAllowedRobot = false; bool allowForAll = false; bool res = false; // Allow passing the pipeline // Check if it an allowed robot first if (context.Request.Headers.TryGetValue("User-Agent", out uas)) { //logger.LogInformation($"User-Agent|{uas}"); foreach (var ua in uas) { var lua = ua?.ToLowerInvariant() ?? ""; if ((allowIndexing) && (lua.Contains("googlebot")) || ((allowIndexing) && (lua.Contains("bingbot"))) || ((allowDevBot) && (lua.Contains("xx-testbot-xx"))) ) { logger.LogInformation($"Robot detected|{context.Connection.RemoteIpAddress}|{lua}"); robotsTag = "noarchive, noimageindex, max-image-preview:none, max-snippet:50"; isAllowedRobot = true; } } } //if (context.Request.Path.ToString().Equals("/sitemap.xml")) //{ // logger.LogInformation($"Sitemap request|{context.Connection.RemoteIpAddress}|Allowing for all useragents, now: {uas}"); // allowForAll = true; //} if (!isAllowedRobot && !allowForAll) { //if ((allowIndexing || allowDevBot) && (context.Request.Path.ToString().Equals("/"))) //{ // logger.LogInformation($"Allow index for / for all UA |{context.Connection.RemoteIpAddress}|Useragent {uas}"); // res = false; //} //else if ((allowIndexing || allowDevBot) && (context.Request.Path.ToString().Equals("/robots.txt"))) //{ // logger.LogInformation($"Suppressing negative robots for all UA|{context.Connection.RemoteIpAddress}|Useragent {uas}"); // context.Response.StatusCode = 404; // res = true; //} //else { context.Response.Headers.Add("X-Robots-Tag", robotsTag); res = false; } } else { if (context.Request.Path.ToString().StartsWith("/robots.txt")) { logger.LogInformation($"Robot robots.txt output|{context.Connection.RemoteIpAddress}"); StringBuilder sb = new StringBuilder(); sb.AppendLine("User-Agent: *"); // Can be wildcard, since we create this only for allowed robots anyway sb.AppendLine("Allow: *"); // TODO: Use from Program.visibleUrlPrefix? sb.AppendLine($"Sitemap: {(context.Request.IsHttps ? "https" : "http")}://{context.Request.Host}/sitemap.xml"); context.Response.ContentType = "text/xml"; context.Response.Headers.Add("Encoding", "UTF-8"); context.Response.StatusCode = 200; await context.Response.WriteAsync(sb.ToString()); //context.Response.StatusCode = 404; res = true; } else if (context.Request.Path.ToString().Equals("/")) { // TODO: Move to SEOMgr logger.LogInformation($"Robot index output|{context.Connection.RemoteIpAddress}"); //// Dont index syntetic start page // robotsTag = "noindex"; context.Response.ContentType = "text/html"; context.Response.Headers.Add("Encoding", "UTF-8"); context.Response.Headers.Add("X-Robots-Tag", robotsTag); context.Response.StatusCode = 200; var repo = context.RequestServices.GetService(); // Create list with blogpost-links StringBuilder csb = new StringBuilder(); // // Stick items first, may contain important legal stuff // Which should no be seen by enduser but who knows // foreach (var p in repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access)) .Where(x => x.StickyMenuPos >= 0) .OrderBy(c => c.StickyMenuPos) .ThenBy(c => c.Title)) { csb.AppendLine($"
  • {InputSanitizer.Default.SanitizeText(p.Title, true)}"); } // // Now the articles them selfes // foreach (var p in repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access)).OrderBy(c => c.Title)) { csb.AppendLine($"
  • {InputSanitizer.Default.SanitizeText(p.Title, true)}"); } var htmlSanitizer = new HtmlSanitizer(); StringBuilder sb = new StringBuilder(); sb.AppendLine(""); sb.AppendLine(""); sb.AppendLine(""); sb.AppendLine(""); sb.AppendLine($"{blogName} - Mainpage"); sb.AppendLine($""); sb.AppendLine($""); sb.AppendLine(""); sb.AppendLine("

    Welcome to my blog


    Site contents / current articles:
      "); sb.AppendLine(htmlSanitizer.Sanitize(csb.ToString())); sb.AppendLine("
    "); sb.AppendLine(""); // Create final output await context.Response.WriteAsync(sb.ToString()); res = true; } else if (context.Request.Path.ToString().Equals("/sitemap.xml")) { // Dont index syntetic start page logger.LogInformation($"Robot sitemap.xml|{context.Connection.RemoteIpAddress}"); robotsTag = ""; context.Response.ContentType = "text/xml"; context.Response.Headers.Add("Encoding", "UTF-8"); //context.Response.Headers.Add("X-Robots-Tag", robotsTag); context.Response.StatusCode = 200; var sm = seoMgr.GetSiteMap(); await context.Response.WriteAsync(sm); res = true; } else if (context.Request.Path.ToString().StartsWith("/viewer/")) { var repo = context.RequestServices.GetService(); var id = context.Request.Path.ToString().Split("/").Last(); logger.LogInformation($"Robot post output|{context.Connection.RemoteIpAddress}|{id}"); var bp = repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access) && String.Equals(id, x.Id)).FirstOrDefault(); if (bp == null) { // Not found or no access context.Response.StatusCode = 404; } else { context.Response.Headers.Add("X-Robots-Tag", robotsTag); context.Response.StatusCode = 200; context.Response.ContentType = "text/html"; context.Response.Headers.Add("Encoding", "UTF-8"); StringBuilder sb = seoMgr.BuildBlogPostOutput(bp); await context.Response.WriteAsync(sb.ToString()); } res = true; } } return res; } } }