using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using AngleSharp;
using Blog3000.Client;
using Blog3000.Shared;
using Ganss.XSS;
using Markdig;
using Markdig.Syntax;
using Microsoft.AspNetCore.Http;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Primitives;

namespace Blog3000.Server.MiddleWares
{
    /// <summary>
    /// Builds the sitemap for the publicly accessible blog posts, keeps it cached in memory
    /// and in a cache file, pings search-engine crawlers when the sitemap content changes,
    /// and renders a single blog post into a standalone output document.
    /// </summary>
    public class SEOMgr : IDisposable
    {
        // Shared across instances; guards sitemap rebuilds.
        private static readonly object siteMapLock = new object();

        // One shared client for crawler pings instead of a new HttpClient per request.
        private static readonly HttpClient crawlerHttpClient = new HttpClient();

        //
        // Update when SEO-Output changes.
        // Used as the lower bound for modification dates written to the sitemap.
        //
        private static readonly DateTime MINSEODATE = new DateTime(2020, 07, 03);

        private readonly ILogger logger;
        private readonly BlogEnv blogEnv;
        private readonly BlogPostRepo blogPostRepo;
        private readonly string urlPrefix;
        private readonly string sitemapCacheFile;

        private string siteMapCache;
        private bool isUpdatingSitemap = false;

        /// <summary>
        /// Creates the manager, subscribes to <c>BlogPostsChanged</c> on the repository and
        /// immediately triggers a first sitemap rebuild.
        /// </summary>
        /// <param name="logger">Logger used for informational and warning output.</param>
        /// <param name="blogEnv">Blog environment; its <c>BlogConfig</c> supplies the visible URL prefix and crawler settings.</param>
        /// <param name="blogPostRepo">Repository that provides post headers, texts and file modification dates.</param>
        public SEOMgr(ILogger logger, BlogEnv blogEnv, BlogPostRepo blogPostRepo)
        {
            this.logger = logger;
            this.blogEnv = blogEnv;
            this.blogPostRepo = blogPostRepo;
            this.urlPrefix = blogEnv.BlogConfig.VisibleUrlPrefix;
            this.sitemapCacheFile = System.IO.Path.Combine(
                (string)AppDomain.CurrentDomain.GetData("AppDataPath"),
                "sitemapcache.xml");

            blogPostRepo.BlogPostsChanged += BlogPostRepo_BlogPostsChanged;
            BlogPostRepo_BlogPostsChanged(null, null);
        }

        /// <summary>
        /// Unsubscribes from the repository's change event.
        /// </summary>
        public void Dispose()
        {
            blogPostRepo.BlogPostsChanged -= BlogPostRepo_BlogPostsChanged;
        }

        /// <summary>
        /// Returns the cached sitemap, rebuilding it first when no cached value exists.
        /// </summary>
        /// <returns>The sitemap text.</returns>
        public string GetSiteMap()
        {
            if (siteMapCache == null)
            {
                UpdateSiteMap();
            }
            return siteMapCache;
        }

        /// <summary>
        /// Renders the given blog post (sanitized title, abstract, author and the HTML
        /// produced from its markdown text) into an output document.
        /// </summary>
        /// <param name="bp">Header of the blog post to render.</param>
        /// <returns>A <see cref="StringBuilder"/> holding the generated document.</returns>
        public StringBuilder BuildBlogPostOutput(BlogPostHeader bp)
        {
            string markdown = blogPostRepo.GetText(bp);

            var title = InputSanitizer.Default.SanitizeText(bp.Title, true);

            // Topics become a comma-terminated keyword list with quotes and hashes stripped.
            var keywordBuilder = new StringBuilder();
            if (bp.Topics != null)
            {
                foreach (var t in bp.Topics)
                {
                    keywordBuilder.Append(t.Replace("'", "").Replace("#", "")).Append(","); // TODO
                }
            }
            var keywords = InputSanitizer.Default.SanitizeText(keywordBuilder.ToString());

            var abstr = "" + bp.Abstract;
            abstr = InputSanitizer.Default.SanitizeText(abstr, true).Replace("'", ""); // TODO

            string author = null;
            if (!String.IsNullOrEmpty(bp.Author))
            {
                blogEnv.BlogConfig?.AuthorFullnames?.TryGetValue(bp.Author, out author);

                // Only sanitize when a full name was actually found; a failed lookup
                // leaves author null and no author line is emitted.
                if (!String.IsNullOrEmpty(author))
                {
                    author = InputSanitizer.Default.SanitizeText(author, true);
                }
            }

            var mdPipeline = new MarkdownPipelineBuilder().UseAdvancedExtensions().Build();
            var mdoc = Markdig.Markdown.Parse(markdown, mdPipeline);

            // Collect the URLs of all link reference definitions for rewriting.
            // A set is used because only membership is checked, and several
            // definitions may legitimately share the same URL.
            var imgRefUrls = new HashSet<string>();
            foreach (var r in mdoc.GetLinkReferenceDefinitions().Links)
            {
                if (r.Value != null && r.Value.Url != null)
                {
                    imgRefUrls.Add(r.Value.Url);
                }
            }

            // Render markdown, rewrite image-urls
            var writer = new StringWriter();
            var renderer = new Markdig.Renderers.HtmlRenderer(writer);
            renderer.LinkRewriter += (a) =>
            {
                if (imgRefUrls.Contains(a))
                {
                    return $"BlogPosts/{bp.Id}/imgref/{a}";
                }
                return a;
            };
            mdPipeline.Setup(renderer);
            renderer.Render(mdoc);

            // Create sanitized html
            var html = writer.ToString();
            html = InputSanitizer.Default.SanitizeHtml(html);

            var lang = "en";
            if (!String.IsNullOrEmpty(bp.Lang))
            {
                lang = InputSanitizer.Default.SanitizeText(bp.Lang);
            }

            // Build final document.
            // NOTE(review): as written here the literals below carry no markup (several are
            // empty) and keywords/lang are not emitted - confirm against the intended template.
            StringBuilder sb = new StringBuilder();
            sb.AppendLine("");
            sb.AppendLine($"");
            sb.AppendLine("");
            sb.AppendLine($"{title}");
            sb.AppendLine("");
            sb.AppendLine("");
            sb.AppendLine($"");
            sb.AppendLine($"");
            sb.AppendLine($"");
            if (!String.IsNullOrEmpty(author))
            {
                sb.AppendLine($"");
            }
            sb.AppendLine("");
            sb.AppendLine("");
            sb.AppendLine($"\n\n{title}\n\n");
            sb.AppendLine($"\n");
            sb.AppendLine($"\nAbstract: {abstr}\n");
            if (!String.IsNullOrEmpty(author))
            {
                sb.AppendLine($"\nAuthor: {author}\n");
            }
            sb.AppendLine("\n");
            sb.AppendLine(html);
            sb.AppendLine("");

            return sb;
        }

        /// <summary>
        /// Builds the sitemap text: one entry per post whose <c>Access</c> equals "PUBLIC"
        /// (ordered by id), followed by an entry for the index page.
        /// </summary>
        /// <returns>The generated sitemap text.</returns>
        private string CreateSitemap()
        {
            // Create list with blogpost-links
            StringBuilder sb = new StringBuilder();
            sb.AppendLine("");
            sb.AppendLine("");

            // Latest modification date over all listed posts; reused for the index page.
            DateTime? latestTime = null;

            // Articles
            foreach (var p in blogPostRepo.GetHeaders().Where(x => "PUBLIC".Equals(x.Access)).OrderBy(x => x.Id))
            {
                sb.AppendLine(" ");
                sb.AppendLine($" {urlPrefix}/viewer/{p.Id}");

                var modified = blogPostRepo.GetFileModDate(p);
                if (modified != null)
                {
                    // Never report a date older than the last SEO-output change.
                    if (modified.Value < MINSEODATE)
                    {
                        modified = MINSEODATE;
                    }
                    sb.AppendLine($" {modified.Value.ToString("o")}");

                    if (latestTime == null || modified.Value > latestTime.Value)
                    {
                        latestTime = modified;
                    }
                }

                sb.AppendLine(" weekly");
                sb.AppendLine(" ");
            }

            // Index page
            sb.AppendLine(" ");
            sb.AppendLine($" {urlPrefix}/");
            sb.AppendLine(" weekly");
            sb.AppendLine(" 0.1");
            if (latestTime != null)
            {
                sb.AppendLine($" {latestTime.Value.ToString("o")}");
            }
            sb.AppendLine(" ");

            // Done
            sb.AppendLine("");
            return sb.ToString();
        }

        /// <summary>
        /// Rebuilds the sitemap, stores it in memory and in the cache file, and starts a
        /// background crawler ping when the content differs from the previous sitemap.
        /// </summary>
        private void UpdateSiteMap()
        {
            lock (siteMapLock)
            {
                // Check if this thread is already rebuilding the sitemap and skip.
                // (Most importantly this avoids notifying the crawlers twice.)
                // A C# lock can be re-entered by the thread holding it.
                // This may occur here when the BlogPostRepo raises its changed event,
                // which is triggered by accessing the BlogPostRepo during the
                // sitemap creation.
                if (isUpdatingSitemap)
                {
                    logger.LogInformation("Already rebuilding Sitemap. Skip");
                    return;
                }

                isUpdatingSitemap = true;
                try
                {
                    // Compare against the in-memory sitemap, or the cache file when
                    // nothing is held in memory yet.
                    var oldSm = siteMapCache;
                    if (String.IsNullOrEmpty(oldSm))
                    {
                        oldSm = LoadSitemapCache();
                    }

                    logger.LogInformation("Rebuilding Sitemap");
                    var sm = CreateSitemap();
                    siteMapCache = sm;
                    SaveSitemapCache(sm);

                    if (!String.Equals(sm, oldSm))
                    {
                        logger.LogInformation("Sitemap changed");

                        // Fire-and-forget: the ping must not run while holding the lock.
                        // Failures are logged inside PingCrawlerAsync.
                        _ = Task.Run(() => PingCrawlersAsync());
                    }
                    else
                    {
                        logger.LogInformation("Sitemap not changed");
                    }
                }
                finally
                {
                    isUpdatingSitemap = false;
                }
            }
        }

        /// <summary>
        /// Pings Google and Bing with the sitemap URL when the configuration allows
        /// search bots (<c>AllowSearchBots</c> or <c>AllowSearchDevBot</c>).
        /// </summary>
        /// <returns><c>true</c> when the pings were attempted; <c>false</c> when pinging is disabled.</returns>
        private async Task<bool> PingCrawlersAsync()
        {
            if (!(blogEnv.BlogConfig.AllowSearchBots || blogEnv.BlogConfig.AllowSearchDevBot))
            {
                return false;
            }

            var sitemapUrl = urlPrefix + "/sitemap.xml";

            // Both requests are started before either is awaited, so they run concurrently.
            await Task.WhenAll(
                PingCrawlerAsync("Google", $"http://www.google.com/ping?sitemap={sitemapUrl}"),
                PingCrawlerAsync("Bing", $"http://www.bing.com/ping?sitemap={sitemapUrl}"));

            return true;
        }

        /// <summary>
        /// Sends one GET request to a crawler ping endpoint and logs the response.
        /// Any failure is logged as a warning instead of being thrown, because the
        /// caller runs this without observing the resulting task.
        /// </summary>
        /// <param name="crawlerName">Name used in log messages.</param>
        /// <param name="pingUrl">Unescaped ping URL including the sitemap query parameter.</param>
        private async Task PingCrawlerAsync(string crawlerName, string pingUrl)
        {
            try
            {
                /*WebUtility.UrlEncode*/
                var u = Uri.EscapeUriString(pingUrl);
                logger.LogInformation($"Notifying crawlers: {crawlerName} ({u})");

                // Await the request itself; the response (not the task) is what gets disposed.
                using (var response = await crawlerHttpClient.GetAsync(u))
                {
                    string code = response.StatusCode.ToString();
                    string apiResponse = await response.Content.ReadAsStringAsync();
                    logger.LogInformation($"{crawlerName} responded: {code} / {apiResponse}");
                }
            }
            catch (Exception ex)
            {
                logger.LogWarning(ex, $"Error notifying crawler {crawlerName}");
            }
        }

        /// <summary>
        /// Reads the sitemap cache file if it exists.
        /// </summary>
        /// <returns>The file content, or <c>null</c> when the file is missing or cannot be read.</returns>
        private string LoadSitemapCache()
        {
            try
            {
                if (System.IO.File.Exists(sitemapCacheFile))
                {
                    return System.IO.File.ReadAllText(sitemapCacheFile);
                }
            }
            catch (Exception ex)
            {
                logger.LogWarning(ex, $"Error reading sitemapCacheFile {sitemapCacheFile}");
            }
            return null;
        }

        /// <summary>
        /// Writes the sitemap to the cache file; failures are logged, not thrown.
        /// </summary>
        /// <param name="sitemap">Sitemap text to persist.</param>
        /// <returns><c>true</c> when the file was written; otherwise <c>false</c>.</returns>
        private bool SaveSitemapCache(string sitemap)
        {
            try
            {
                System.IO.File.WriteAllText(sitemapCacheFile, sitemap);
                return true;
            }
            catch (Exception ex)
            {
                logger.LogWarning(ex, $"Error writing sitemapCacheFile file {sitemapCacheFile}");
            }
            return false;
        }

        /// <summary>
        /// Handles the repository's change event by rebuilding the sitemap.
        /// </summary>
        private void BlogPostRepo_BlogPostsChanged(object sender, EventArgs e)
        {
            UpdateSiteMap();
        }
    }
}