blog3000/Blog3000/Server/MiddleWares/SearchBotHandler.cs

using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Primitives;
using Microsoft.Extensions.DependencyInjection;
using System;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Ganss.XSS;
using Blog3000.Shared;
using Microsoft.Extensions.Logging;

namespace Blog3000.Server.MiddleWares
{
    /// <summary>
    /// By default, no SEO is allowed.
    /// Two-fold strategy: prohibit indexing via a static robots.txt and HTTP headers;
    /// for valid bots, indexing may be allowed. A synthetic index plus blog posts are
    /// generated, matching the URI scheme of the Blazor viewer on the client.
    /// </summary>
    public class SearchBotHandler
    {
        private readonly string blogName;
        private readonly bool allowIndexing; // allow valid bots like Bing + Google
        private readonly bool allowDevBot;   // allow a dev test box ('xx-testbot-xx')
        private readonly ILogger logger;
        private readonly SEOMgr seoMgr;

        public SearchBotHandler(ILogger<SearchBotHandler> logger, BlogEnv blogEnv, SEOMgr seoMgr)
        {
            this.logger = logger;
            this.seoMgr = seoMgr;
            this.blogName = blogEnv?.BlogConfig?.Title ?? "Blog3000 Instance";
            // Default to "no indexing" when the configuration is missing,
            // matching the null-safe handling of Title above
            this.allowIndexing = blogEnv?.BlogConfig?.AllowSearchBots ?? false;
            this.allowDevBot = blogEnv?.BlogConfig?.AllowSearchDevBot ?? false;
        }
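
        // Assumed DI registration (not part of this file; the lifetimes shown
        // here are a guess, not confirmed by the source):
        //   services.AddSingleton<SEOMgr>();
        //   services.AddSingleton<BlogEnv>();
        //   services.AddSingleton<SearchBotHandler>();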
        public async Task<bool> Process(HttpContext context)
        {
            string robotsTag = "noindex, nofollow";
            StringValues uas;
            bool isAllowedRobot = false;
            bool allowForAll = false;
            bool res = false; // false = the request continues down the pipeline

            // Check if it is an allowed robot first
            if (context.Request.Headers.TryGetValue("User-Agent", out uas))
            {
                //logger.LogInformation($"User-Agent|{uas}");
                foreach (var ua in uas)
                {
                    var lua = ua?.ToLowerInvariant() ?? "";
                    if ((allowIndexing && lua.Contains("googlebot"))
                        || (allowIndexing && lua.Contains("bingbot"))
                        || (allowDevBot && lua.Contains("xx-testbot-xx")))
                    {
                        logger.LogInformation($"Robot detected|{context.Connection.RemoteIpAddress}|{lua}");
                        robotsTag = "noarchive, noimageindex, max-image-preview:none, max-snippet:50";
                        isAllowedRobot = true;
                    }
                }
            }
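
            // Caveat: User-Agent strings are trivially spoofable, so this check
            // alone does not prove a request comes from Google or Bing. A hardened
            // setup would additionally verify RemoteIpAddress via reverse DNS
            // (e.g. *.googlebot.com, *.search.msn.com) plus a confirming forward
            // lookup, per the search engines' published guidelines.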
            //if (context.Request.Path.ToString().Equals("/sitemap.xml"))
            //{
            //    logger.LogInformation($"Sitemap request|{context.Connection.RemoteIpAddress}|Allowing for all useragents, now: {uas}");
            //    allowForAll = true;
            //}
            if (!isAllowedRobot && !allowForAll)
            {
                //if ((allowIndexing || allowDevBot) && (context.Request.Path.ToString().Equals("/")))
                //{
                //    logger.LogInformation($"Allow index for / for all UA |{context.Connection.RemoteIpAddress}|Useragent {uas}");
                //    res = false;
                //}
                //else if ((allowIndexing || allowDevBot) && (context.Request.Path.ToString().Equals("/robots.txt")))
                //{
                //    logger.LogInformation($"Suppressing negative robots for all UA|{context.Connection.RemoteIpAddress}|Useragent {uas}");
                //    context.Response.StatusCode = 404;
                //    res = true;
                //}
                //else
                {
                    // Unknown client: mark the response as non-indexable and let
                    // the request pass through the normal pipeline
                    context.Response.Headers.Add("X-Robots-Tag", robotsTag);
                    res = false;
                }
            }
            else
            {
                if (context.Request.Path.ToString().StartsWith("/robots.txt"))
                {
                    logger.LogInformation($"Robot robots.txt output|{context.Connection.RemoteIpAddress}");
                    StringBuilder sb = new StringBuilder();
                    sb.AppendLine("User-Agent: *"); // Can be a wildcard, since we create this only for allowed robots anyway
                    sb.AppendLine("Allow: /");
                    // TODO: Use from Program.visibleUrlPrefix?
                    sb.AppendLine($"Sitemap: {(context.Request.IsHttps ? "https" : "http")}://{context.Request.Host}/sitemap.xml");
                    context.Response.ContentType = "text/plain; charset=utf-8"; // robots.txt is plain text, not XML
                    context.Response.StatusCode = 200;
                    await context.Response.WriteAsync(sb.ToString());
                    //context.Response.StatusCode = 404;
                    res = true;
                }
                else if (context.Request.Path.ToString().Equals("/"))
                {
                    // TODO: Move to SEOMgr
                    logger.LogInformation($"Robot index output|{context.Connection.RemoteIpAddress}");
                    //// Don't index synthetic start page
                    // robotsTag = "noindex";
                    context.Response.ContentType = "text/html; charset=utf-8";
                    context.Response.Headers.Add("X-Robots-Tag", robotsTag);
                    context.Response.StatusCode = 200;
                    var repo = context.RequestServices.GetService<BlogPostRepo>();
                    // Create list with blog post links
                    StringBuilder csb = new StringBuilder();
                    //
                    // Sticky items first; they may contain important legal stuff
                    // which should not normally be seen by the end user, but who knows
                    //
                    foreach (var p in repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access))
                        .Where(x => x.StickyMenuPos >= 0)
                        .OrderBy(c => c.StickyMenuPos)
                        .ThenBy(c => c.Title))
                    {
                        csb.AppendLine($"<li><a href='viewer/{p.Id}'>{InputSanitizer.Default.SanitizeText(p.Title, true)}</a></li>");
                    }
                    //
                    // Now the articles themselves (excluding sticky items,
                    // which are already listed above)
                    //
                    foreach (var p in repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access))
                        .Where(x => x.StickyMenuPos < 0)
                        .OrderBy(c => c.Title))
                    {
                        csb.AppendLine($"<li><a href='viewer/{p.Id}'>{InputSanitizer.Default.SanitizeText(p.Title, true)}</a></li>");
                    }
                    var htmlSanitizer = new HtmlSanitizer();
                    StringBuilder sb = new StringBuilder();
                    sb.AppendLine("<!DOCTYPE html>");
                    sb.AppendLine("<html>");
                    sb.AppendLine("<head>");
                    sb.AppendLine($"<title>{blogName} - Mainpage</title>");
                    sb.AppendLine("<meta name='description' content='Index page of blog'>");
                    sb.AppendLine("<meta name='keywords' content='MainPage,Index,Content'>");
                    sb.AppendLine("</head>");
                    sb.AppendLine("<body><h1>Welcome to my blog</h1><hr>Site contents / current articles:<ul>");
                    sb.AppendLine(htmlSanitizer.Sanitize(csb.ToString()));
                    sb.AppendLine("</ul></body>");
                    sb.AppendLine("</html>");
                    // Create final output
                    await context.Response.WriteAsync(sb.ToString());
                    res = true;
                }
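
                // Shape of the generated page (title/id values are illustrative):
                //   <html><head><title>{blogName} - Mainpage</title>...</head>
                //   <body><h1>Welcome to my blog</h1>...<ul>
                //     <li><a href='viewer/42'>Some article</a></li>
                //   </ul></body></html>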
                else if (context.Request.Path.ToString().Equals("/sitemap.xml"))
                {
                    logger.LogInformation($"Robot sitemap.xml|{context.Connection.RemoteIpAddress}");
                    // No X-Robots-Tag here: the sitemap is metadata, not indexable content
                    context.Response.ContentType = "text/xml; charset=utf-8";
                    context.Response.StatusCode = 200;
                    var sm = seoMgr.GetSiteMap();
                    await context.Response.WriteAsync(sm);
                    res = true;
                }
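
                // GetSiteMap() is assumed (its implementation is not in this file)
                // to return a standard sitemaps.org <urlset> document, roughly:
                //   <?xml version="1.0" encoding="UTF-8"?>
                //   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                //     <url><loc>https://example.org/viewer/{postId}</loc></url>
                //   </urlset>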
                else if (context.Request.Path.ToString().StartsWith("/viewer/"))
                {
                    var repo = context.RequestServices.GetService<BlogPostRepo>();
                    var id = context.Request.Path.ToString().Split("/").Last();
                    logger.LogInformation($"Robot post output|{context.Connection.RemoteIpAddress}|{id}");
                    var bp = repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access) && String.Equals(id, x.Id)).FirstOrDefault();
                    if (bp == null)
                    {
                        // Not found or no access
                        context.Response.StatusCode = 404;
                    }
                    else
                    {
                        context.Response.Headers.Add("X-Robots-Tag", robotsTag);
                        context.Response.StatusCode = 200;
                        context.Response.ContentType = "text/html; charset=utf-8";
                        StringBuilder sb = seoMgr.BuildBlogPostOutput(bp);
                        await context.Response.WriteAsync(sb.ToString());
                    }
                    res = true;
                }
            }
            return res;
        }
    }
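
    // A minimal wiring sketch (an assumption for illustration; the actual
    // Program.cs / Startup.cs registration is not part of this file).
    // Process() returns true when it has fully written the response, so the
    // pipeline should only continue when it returns false.
    public class SearchBotMiddleware
    {
        private readonly RequestDelegate next;

        public SearchBotMiddleware(RequestDelegate next)
        {
            this.next = next;
        }

        // Convention-based middleware: SearchBotHandler is resolved from DI per request
        public async Task Invoke(HttpContext context, SearchBotHandler handler)
        {
            if (!await handler.Process(context))
            {
                await next(context);
            }
        }
    }
    // Registered e.g. with: app.UseMiddleware<SearchBotMiddleware>();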
}