using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Primitives;
using Microsoft.Extensions.DependencyInjection;
using System;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Ganss.XSS;
using Blog3000.Shared;
using Microsoft.Extensions.Logging;

namespace Blog3000.Server.MiddleWares
{
    /// <summary>
    /// By default, no indexing (SEO) is allowed.
    /// Two-fold strategy: prohibit indexing via a static robots.txt and HTTP headers.
    /// For valid bots, indexing may be allowed: a synthetic index and blog posts are
    /// generated that match the URI scheme of the Blazor viewer on the client.
    /// </summary>
    /// <remarks>
    /// <see cref="allowIndexing"/> admits valid bots such as Google and Bing;
    /// <see cref="allowDevBot"/> admits a dev test box ('xx-testbot-xx').
    /// </remarks>
    public class SearchBotHandler
    {
        private readonly string blogName;
        private readonly bool allowIndexing;
        private readonly bool allowDevBot;

        private readonly ILogger logger;
        private readonly SEOMgr seoMgr;

        public SearchBotHandler(ILogger<SearchBotHandler> logger, BlogEnv blogEnv, SEOMgr seoMgr)
        {
            this.logger = logger;
            this.seoMgr = seoMgr;
            this.blogName = blogEnv?.BlogConfig?.Title ?? "Blog3000 Instance";
            // Stay consistent with the null handling above: missing config means no indexing
            this.allowIndexing = blogEnv?.BlogConfig?.AllowSearchBots ?? false;
            this.allowDevBot = blogEnv?.BlogConfig?.AllowSearchDevBot ?? false;
        }
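
        // The flags above come from the blog configuration. A minimal sketch of what
        // that section might look like in appsettings (the JSON shape is an assumption,
        // not shown in this file):
        //
        //   "BlogConfig": {
        //     "Title": "My Blog",
        //     "AllowSearchBots": true,
        //     "AllowSearchDevBot": false
        //   }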

        /// <summary>
        /// Handles search bot traffic. Returns true if the response was fully written
        /// here (short-circuit the pipeline), false if the request should pass through.
        /// </summary>
        public async Task<bool> Process(HttpContext context)
        {
            string robotsTag = "noindex, nofollow";
            StringValues uas;
            bool isAllowedRobot = false;
            bool allowForAll = false;
            bool res = false; // false = the request continues down the pipeline

            // Check whether the request comes from an allowed robot first
            if (context.Request.Headers.TryGetValue("User-Agent", out uas))
            {
                //logger.LogInformation($"User-Agent|{uas}");

                foreach (var ua in uas)
                {
                    var lua = ua?.ToLowerInvariant() ?? "";
                    if ((allowIndexing && lua.Contains("googlebot"))
                        || (allowIndexing && lua.Contains("bingbot"))
                        || (allowDevBot && lua.Contains("xx-testbot-xx")))
                    {
                        logger.LogInformation($"Robot detected|{context.Connection.RemoteIpAddress}|{lua}");
                        robotsTag = "noarchive, noimageindex, max-image-preview:none, max-snippet:50";
                        isAllowedRobot = true;
                    }
                }
            }
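
            // For reference, these substring checks match real crawler User-Agents such as
            // "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" or
            // "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)".
            // User-Agent strings are trivially spoofable, so this is advisory, not a security check.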

            //if (context.Request.Path.ToString().Equals("/sitemap.xml"))
            //{
            //    logger.LogInformation($"Sitemap request|{context.Connection.RemoteIpAddress}|Allowing for all useragents, now: {uas}");
            //    allowForAll = true;
            //}

            if (!isAllowedRobot && !allowForAll)
            {
                //if ((allowIndexing || allowDevBot) && (context.Request.Path.ToString().Equals("/")))
                //{
                //    logger.LogInformation($"Allow index for / for all UA |{context.Connection.RemoteIpAddress}|Useragent {uas}");
                //    res = false;
                //}
                //else if ((allowIndexing || allowDevBot) && (context.Request.Path.ToString().Equals("/robots.txt")))
                //{
                //    logger.LogInformation($"Suppressing negative robots for all UA|{context.Connection.RemoteIpAddress}|Useragent {uas}");
                //    context.Response.StatusCode = 404;
                //    res = true;
                //}
                //else
                {
                    // Not an allowed robot: mark the response as non-indexable and let it pass through
                    context.Response.Headers.Add("X-Robots-Tag", robotsTag);
                    res = false;
                }
            }
            else
            {
                if (context.Request.Path.ToString().StartsWith("/robots.txt"))
                {
                    logger.LogInformation($"Robot robots.txt output|{context.Connection.RemoteIpAddress}");
                    StringBuilder sb = new StringBuilder();
                    sb.AppendLine("User-Agent: *"); // Wildcard is fine, since this file is only served to allowed robots anyway
                    sb.AppendLine("Allow: /");

                    // TODO: Use from Program.visibleUrlPrefix?
                    sb.AppendLine($"Sitemap: {(context.Request.IsHttps ? "https" : "http")}://{context.Request.Host}/sitemap.xml");

                    // robots.txt is plain text; declare the charset in the Content-Type header
                    context.Response.ContentType = "text/plain; charset=utf-8";
                    context.Response.StatusCode = 200;

                    await context.Response.WriteAsync(sb.ToString());
                    //context.Response.StatusCode = 404;
                    res = true;
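
                    // Illustrative output for a robot fetching https://example.org/robots.txt
                    // (host name is just an example):
                    //
                    //   User-Agent: *
                    //   Allow: /
                    //   Sitemap: https://example.org/sitemap.xml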
                }
                else if (context.Request.Path.ToString().Equals("/"))
                {
                    // TODO: Move to SEOMgr

                    logger.LogInformation($"Robot index output|{context.Connection.RemoteIpAddress}");

                    //// Don't index the synthetic start page
                    //robotsTag = "noindex";

                    context.Response.ContentType = "text/html; charset=utf-8";
                    context.Response.Headers.Add("X-Robots-Tag", robotsTag);
                    context.Response.StatusCode = 200;

                    var repo = context.RequestServices.GetRequiredService<BlogPostRepo>();

                    // Build a list with blog post links
                    StringBuilder csb = new StringBuilder();

                    //
                    // Sticky items first; they may contain important legal stuff
                    // which should not be seen by the end user, but who knows
                    //
                    foreach (var p in repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access))
                                          .Where(x => x.StickyMenuPos >= 0)
                                          .OrderBy(c => c.StickyMenuPos)
                                          .ThenBy(c => c.Title))
                    {
                        csb.AppendLine($"<li><a href='viewer/{p.Id}'>{InputSanitizer.Default.SanitizeText(p.Title, true)}</a></li>");
                    }

                    //
                    // Now the articles themselves (skip the sticky posts already listed above)
                    //
                    foreach (var p in repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access) && x.StickyMenuPos < 0).OrderBy(c => c.Title))
                    {
                        csb.AppendLine($"<li><a href='viewer/{p.Id}'>{InputSanitizer.Default.SanitizeText(p.Title, true)}</a></li>");
                    }

                    var htmlSanitizer = new HtmlSanitizer();

                    StringBuilder sb = new StringBuilder();
                    sb.AppendLine("<!DOCTYPE html>");
                    sb.AppendLine("<html>");
                    sb.AppendLine("<head>");
                    sb.AppendLine($"<title>{blogName} - Mainpage</title>");
                    sb.AppendLine("<meta name='description' content='Index page of blog' />");
                    sb.AppendLine("<meta name='keywords' content='MainPage,Index,Content' />");
                    sb.AppendLine("</head>");
                    sb.AppendLine("<body><h1>Welcome to my blog</h1><hr>Site contents / current articles:<ul>");
                    sb.AppendLine(htmlSanitizer.Sanitize(csb.ToString()));
                    sb.AppendLine("</ul></body>");
                    sb.AppendLine("</html>");

                    // Write the final output
                    await context.Response.WriteAsync(sb.ToString());
                    res = true;
                }
                else if (context.Request.Path.ToString().Equals("/sitemap.xml"))
                {
                    logger.LogInformation($"Robot sitemap.xml|{context.Connection.RemoteIpAddress}");
                    // The sitemap itself gets no restrictive robots tag
                    robotsTag = "";

                    context.Response.ContentType = "text/xml; charset=utf-8";
                    //context.Response.Headers.Add("X-Robots-Tag", robotsTag);
                    context.Response.StatusCode = 200;

                    var sm = seoMgr.GetSiteMap();
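
                    // GetSiteMap() is expected to return standard sitemap XML, roughly of this
                    // shape (illustrative only; the exact output depends on SEOMgr):
                    //
                    //   <?xml version="1.0" encoding="UTF-8"?>
                    //   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                    //     <url><loc>https://example.org/viewer/some-post-id</loc></url>
                    //   </urlset>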

                    await context.Response.WriteAsync(sm);
                    res = true;
                }
                else if (context.Request.Path.ToString().StartsWith("/viewer/"))
                {
                    var repo = context.RequestServices.GetRequiredService<BlogPostRepo>();
                    var id = context.Request.Path.ToString().Split("/").Last();

                    logger.LogInformation($"Robot post output|{context.Connection.RemoteIpAddress}|{id}");

                    var bp = repo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access) && String.Equals(id, x.Id)).FirstOrDefault();
                    if (bp == null)
                    {
                        // Not found or no access
                        context.Response.StatusCode = 404;
                    }
                    else
                    {
                        context.Response.Headers.Add("X-Robots-Tag", robotsTag);
                        context.Response.StatusCode = 200;
                        context.Response.ContentType = "text/html; charset=utf-8";

                        StringBuilder sb = seoMgr.BuildBlogPostOutput(bp);
                        await context.Response.WriteAsync(sb.ToString());
                    }
                    res = true;
                }
            }

            return res;
        }
    }
}
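
// A minimal wiring sketch (an assumption for illustration; the actual registration in
// Program.cs is not part of this file). Process() returns true when it fully handled
// the response, so an inline middleware can short-circuit the pipeline on that result:
//
//   app.Use(async (context, next) =>
//   {
//       var botHandler = context.RequestServices.GetRequiredService<SearchBotHandler>();
//       if (!await botHandler.Process(context))
//           await next();
//   });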