#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# BY: Reza (User:reza1615 on fa.wikipedia)
# BY: Z (User:ZxxZxxZ on fa.wikipedia)
# Distributed under the terms of the CC-BY-SA 3.0 license.
"""
You can run the bot with the following commandline parameters:
-file - Work on all pages given in a local text file.
Will read any [[wiki link]] and use these articles.
Argument can also be given as "-file:filename".
-cat - Work on all pages which are in a specific category.
Argument can also be given as "-cat:categoryname".
-page - Only edit a specific page.
Argument can also be given as "-page:pagetitle". You can give this
parameter multiple times to edit multiple pages.
-ref - Work on all pages that link to a certain page.
Argument can also be given as "-ref:referredpagetitle".
-filelinks - Work on all pages that link to a certain image.
Argument can also be given as "-filelinks:ImageName".
-links - Work on all pages that are linked to from a certain page.
Argument can also be given as "-links:linkingpagetitle".
-start - Work on all pages in the wiki, starting at a given page. Choose
"-start:!" to start at the beginning.
NOTE: You are advised to use -xml instead of this option; this is
meant for cases where there is no recent XML dump.
i.e. "-start:Category:!" or "-start:template:a"
-except:XYZ - Ignore pages which contain XYZ. If the -regex argument is given,
XYZ will be regarded as a regular expression.
-summary:XYZ - Set the summary message text for the edit to XYZ, bypassing the
predefined message texts with original and replacements inserted.
-template:XYZ - Work on all pages that transclude the template XYZ.
-namespace:n - Number of namespace to process. The parameter can be used
multiple times. It works in combination with all other
parameters, except for the -start parameter. If you e.g. want to
iterate over all user pages starting at User:M, use
-start:User:M.
-always - Don't prompt for each replacement.
-save - Save the result to a subpage on the wiki; you will be asked for
the subpage title (e.g. User:yourusername/findbox).
-nofa:XYZ - Choose which pages to keep: "onlyen" (the default) keeps only
infoboxes whose article has no equivalent on the target wiki,
"onlyfa" keeps only those that do. The shortcuts -onlyen and
-onlyfa do the same.
NOTE: Only use either -xml or -file or -cat or -template or -page, but don't mix them.
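Example run (the script filename below is only illustrative; use the name of
your local copy, and any category or page title you need):
python box_collector.py -cat:Academy_Awards -onlyen
python box_collector.py -page:"Academy Award for Best Picture" -onlyfa -save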
"""
try:
import MySQLdb
except ImportError:
pass
import wikipedia, sys
import pagegenerators
import re, os, codecs, catlib, string
import query
from collections import defaultdict
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
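# Originally written for fa.wikipedia; the 'fa'-named variables and files
# below now point at the Arabic Wikipedia ('ar').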
faSite = wikipedia.getSite('ar')
enSite = wikipedia.getSite('en')
txtTmp=' '
adressfa=u' '
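# Edit summary used on the target wiki ("Bot: transferring the infobox").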
msg = u'بوت نقل قالب المعلومات'
savewiki= False
nofa='onlyen'
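# Template names treated as the start of an infobox; boxfind() and BotRun()
# match them case-insensitively on the first letter.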
boxes=[u'infobox',u'Geobox']
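# Return the title of the interwiki (langlink) equivalent of enlink on
# secondsite, using the MediaWiki API; returns False if there is none.
# If check is True the target page is fetched and redirects are resolved.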
def englishdictionry( enlink ,firstsite,secondsite,check):
try:
enlink=unicode(str(enlink),'UTF-8').replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
except:
enlink=enlink.replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
enlink=enlink.split(u'#')[0].strip()
if enlink==u'':
return False
enlink=enlink.replace(u' ',u'_')
site = wikipedia.getSite(firstsite)
sitesecond= wikipedia.getSite(secondsite)
params = {
'action': 'query',
'prop': 'langlinks',
'titles': enlink,
'redirects': 1,
'lllimit':500,
}
try:
categoryname = query.GetData(params,site, encodeTitle = True)
for item in categoryname[u'query'][u'pages']:
case=categoryname[u'query'][u'pages'][item][u'langlinks']
for item in case:
if item[u'lang']==secondsite:
intersec=item[u'*']
break
if check==True:
secondsitep = wikipedia.Page( sitesecond,intersec)
try:
text = secondsitep.get()
except wikipedia.NoPage:
wikipedia.output(u"%s doesn't exist, skip!" % secondsitep.title())
return False
except wikipedia.IsRedirectPage:
wikipedia.output(u"%s is a redirect; using its target." % secondsitep.title())
newpage = secondsitep.getRedirectTarget()
intersec=newpage.title()
result=intersec
return result
except:
return False
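# Resolve a possible redirect on firstsite via the API and return the final
# title; if the page is not a redirect, the cleaned-up title is returned.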
def redirectquery( enlink,firstsite):
try:
enlink=unicode(str(enlink),'UTF-8').replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
except:
enlink=enlink.replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
enlink=enlink.split(u'#')[0].strip()
if enlink==u'':
return False
enlink=enlink.replace(u' ',u'_')
site = wikipedia.getSite(firstsite)
params = {
'action': 'query',
'titles': enlink,
'redirects':1}
try:
redirectname = query.GetData(params,site, encodeTitle = True)
redirectname=redirectname[u'query'][u'redirects'][0]['to'].replace(u'_',u' ')
return redirectname
except:
enlink=enlink.replace(u'_',u' ')
return enlink
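# Walk the subcategories of every category in listacategory, appending new
# ones to the list; the walk is capped at roughly 200 categories.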
def categorydown(listacategory):
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
count=1
for catname in listacategory:
count+=1
if count==200:
break
gencat = pagegenerators.SubCategoriesPageGenerator(catname, recurse=False)
try:
for subcat in gencat:
try:
wikipedia.output(str(subcat))
except:
wikipedia.output(subcat)
if subcat in listacategory:
continue
else:
listacategory.append(subcat)
except:
continue
return listacategory
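# Query the enwiki database replica for the members of an English category
# that already have an 'ar' langlink; returns their Arabic titles, or False.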
def pagefafinder(encatTitle):
cats=[]
try:
item=unicode(str(encatTitle),'Ascii').replace('[[en:','').replace(']]','').replace(' ','_').replace('Category:','')
except:
item=str(encatTitle).replace('[[en:','').replace(']]','').replace(' ','_').replace('Category:','')
#-----------------start sql---------------------------------------
queries ='SELECT /* SLOW_OK */ ll_title FROM page JOIN categorylinks JOIN langlinks WHERE cl_to = "'+item+'" AND cl_from=page_id AND page_namespace = 0 AND page_id =ll_from AND ll_lang = "ar" AND page_namespace = 0 GROUP BY ll_title ;'
site1 = wikipedia.getSite('en')
TS_DB_HOST = 'sql-s3'
MY_CNF = '~/.my.cnf'
cn = MySQLdb.connect("enwiki.labsdb", db = site1.dbName(), read_default_file = '~/.my.cnf')
cur = cn.cursor()
cur.execute(queries)
results = cur.fetchall()
cn.close()
#------------------end of sql--------------------------------------------
for raw in results:
cats.append(raw)
if cats!=[]:
return cats
else:
return False
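# Build the work list for a category: its subcategories are walked with
# categorydown() and their members collected, preferring the SQL helper and
# falling back to the API generator when the database is unavailable.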
def encatlist(encat):
encat=encat.replace(u'Category:Category:',u'Category:')
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
count=0
listenpageTitle=[]
PageTitle=encat.replace(u'[[',u'').replace(u']]',u'').strip()
language='en'
PageTitles =[PageTitle]
for PageTitle in PageTitles:
cat = catlib.Category( wikipedia.getSite(language),PageTitle )
if str(cat).find('stubs')!=-1:
continue
listacategory=[cat]
listacategory=categorydown(listacategory)
for enpageTitle in listacategory:
try:
fapages=pagefafinder(enpageTitle)
for pages in fapages:
pages=unicode(pages[0],'UTF-8')
wikipedia.output(u'\03{lightgreen}Adding '+pages+u' to fapage lists\03{default}')
listenpageTitle.append(pages)
except:
try:
enpageTitle=unicode(str(enpageTitle),'UTF-8').split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
except:
enpageTitle=enpageTitle.split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
cat = catlib.Category( wikipedia.getSite(language),enpageTitle )
gent = pagegenerators.CategorizedPageGenerator( cat )
for pagework in gent:
count+=1
try:
link=str(pagework).split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
except:
pagework=unicode(str(pagework),'UTF-8')
link=pagework.split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
listenpageTitle.append(link)
if listenpageTitle==[]:
return False
return listenpageTitle
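# Fallback extractor: scan the wikitext line by line, counting '{{' and '}}'
# until they balance, and return the first template whose name starts with
# one of the names in boxes.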
def boxfind(text_en):
text_en=text_en.replace(u'{{ ',u'{{').replace(u'{{ ',u'{{').replace(u'{{template:',u'{{').replace(u'{{Template:',u'{{')
lines=text_en.split('\n')
start=False
box=u'\n'
diff=1
linebaz,linebasteh=0,0
for our_box in boxes:
our_box=our_box.strip()
up_our_box=our_box[0].upper()+our_box[1:]
lower_our_box=our_box[0].lower()+our_box[1:]
for line in lines:
if line==u'':
continue
if line.find(lower_our_box)!=-1 :# lower case
start=True
linebaz,linebasteh=0,0
box+=u'{{'+lower_our_box+line.split(u'{{'+lower_our_box)[1]+'\n'
linebaz += string.count( line,"{{" )
linebasteh += string.count( line,"}}" )
diff=linebaz-linebasteh
continue
if line.find(up_our_box)!=-1 :# upper case
start=True
linebaz,linebasteh=0,0
box+=u'{{'+up_our_box+line.split(u'{{'+up_our_box)[1]+'\n'
linebaz += string.count( line,"{{" )
linebasteh += string.count( line,"}}" )
diff=linebaz-linebasteh
continue
if start==True and diff!=0:
linebaz += string.count( line,"{{" )
linebasteh += string.count( line,"}}" )
diff=linebaz-linebasteh
box+=line+'\n'
if diff==0 and start==True:
break
return box
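# Extract the infobox from text_en; depending on the nofa mode, pages that do
# (or do not) already have an equivalent article on the target wiki are
# skipped. Returns the box wrapped in @@@/$$$ markers, or False.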
def BotRun(page,text_en,nofa,counting):
wikipedia.output(u'------Article Number \03{lightblue}'+unicode(str(counting),'UTF-8')+u'\03{default} ----'+unicode(str(page),'UTF-8')+u'------------')
faresult=englishdictionry( page ,'en','ar',False)
if faresult==False and nofa=='onlyfa':
wikipedia.output( u'\03{lightpurple}===>'+unicode(str(page),'UTF-8')+u' in fawiki has no article \03{default}')
return False
if faresult!=False and nofa=='onlyen':
wikipedia.output( u'\03{lightpurple}===>'+unicode(str(page),'UTF-8')+u' in fawiki has article \03{default}')
return False
lines=text_en.split('\n')
matn=' '
for line in lines:
linebaz=string.count(line,'{{')
linebaste=string.count(line,'}}')
diff=linebaz-linebaste
if diff==0:
line=line.replace('{{','$AAAA$').replace('}}','!BBBB!')
linebaz=0
linebaste=0
matn+=line+u'\n'
newtext=''
for our_box in boxes:
our_box=our_box.strip()
try:
newtext= re.search(ur'(\{\{\s*['+our_box[0].lower()+our_box[0].upper()+ur']'+our_box[1:]+ur'[_\s](?:\{\{.*?\}\}|[^\}])*\}\})',matn, re.S).group(1)# if the template box has another name, please change this regex
newtext=newtext.replace(u'$AAAA$',u'{{').replace(u'!BBBB!',u'}}')
break
except:
continue
if not newtext.strip():
newtext=boxfind(text_en)
if not newtext.strip():
wikipedia.output( u'===>'+unicode(str(page),'UTF-8')+u' Without Templatebox' )
return False
wikipedia.output( u'\03{lightgreen}'+unicode(str(page),'UTF-8')+u" added to list \03{default}")
newtext=u'\n@@@\n$$$'+unicode(str(page),'UTF-8')+u'$$$\n'+newtext+u'\n@@@\n'
return newtext
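# Query the database replica for articles that transclude the given template
# and have an 'ar' langlink. This helper is not referenced anywhere else in
# the script at the moment.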
def templatefinder(linkingPageTitle,nofa):
articles=[]
try:
item=unicode(str(linkingPageTitle),'Ascii').replace('[[en:','').replace(']]','').replace(' ','_').replace('Template:','').replace('template:','').replace('قالب:','')
except:
item=str(linkingPageTitle).replace('[[en:','').replace(']]','').replace(' ','_').replace('Template:','').replace('template:','').replace('قالب:','')
#-----------------start sql---------------------------------------
if nofa=='onlyfa':
queries ='SELECT /* SLOW_OK */ page_title FROM page JOIN langlinks WHERE page_namespace = 0 AND page_is_redirect = 0 AND page_id IN (SELECT tl_from FROM templatelinks WHERE tl_title = "'+item+'" AND tl_namespace = 10) AND ll_lang = "ar" GROUP BY page_title LIMIT 10;'
else:
# No query has been written for the 'onlyen' case yet, so bail out
# instead of executing an undefined statement.
return False
wikipedia.output(queries)
site1 = wikipedia.getSite('en')
TS_DB_HOST = 'sql-s3'
MY_CNF = '~/.my.cnf'
cn = MySQLdb.connect("enwiki.labsdb", db = site1.dbName(), read_default_file = '~/.my.cnf')
cur = cn.cursor()
cur.execute(queries)
results = cur.fetchall()
cn.close()
#------------------end of sql--------------------------------------------
for raw in results:
articles.append(raw)
if articles!=[]:
return articles
else:
return False
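# Main worker: iterate over the generator, extract every infobox with
# BotRun() and append the results to tempresult.txt; the English categories
# of each page are appended to encats.txt.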
def run(generator,savewiki,adressfa,nofa):
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
site = wikipedia.getSite( 'en' )
with codecs.open( u'tempresult.txt',mode = 'w',encoding = 'utf8' ) as fars:
fars.write( u'\n' )
with codecs.open( u'encats.txt',mode = 'w',encoding = 'utf8' ) as fars2:
fars2.write( u'\n' )
new_text='\n'
counting=0
for pageen in generator:
try:
enlink=unicode(str(pageen),'UTF-8').replace(u'[[',u'').replace(u']]',u'')
except:
enlink=pageen.replace(u'[[',u'').replace(u']]',u'')
if enlink.find(u'talk:')!=-1 or enlink.find(u'Talk:')!=-1 or enlink.find(u'User:')!=-1 or enlink.find(u'Template:')!=-1 or enlink.find(u'Wikipedia:')!=-1 or enlink.find(u'Category:')!=-1:
continue
page = wikipedia.Page( site,enlink )
try:
text_fa = page.get()
catsen=page.categories()
except wikipedia.NoPage:
wikipedia.output( u'Page %s not found' % page.title() )
continue
except wikipedia.IsRedirectPage:
pageRedirect = page.getRedirectTarget()
text_fa = pageRedirect.get()
catsen=pageRedirect.categories()
wikipedia.output( u'Page was a redirect; working on %s instead.' % pageRedirect )
except:
continue
counting+=1
new_text=BotRun(page,text_fa,nofa,counting)
if new_text==False or new_text=='\n':
continue
with codecs.open( u'tempresult.txt',mode = 'a',encoding = 'utf8' ) as fars:
fars.write( new_text )
catsen=unicode(str(catsen),'UTF-8').replace(u'[Category{',u'').replace(u'}, Category{',u'\n').replace(u']]}]',u']]').replace(u'[[en:',u'[[:en:')+u'\n'
with codecs.open( u'encats.txt',mode = 'a',encoding = 'utf8' ) as encatfile:
encatfile.write( catsen )
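# Replace the English wiki links inside tempresult.txt with their Arabic
# equivalents (looked up through the langlinks API) and write the result to
# NeededBoxes.txt; links already containing Perso-Arabic characters are kept.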
def linktranslation():
farsichar=u'ابضصثقفغعهخحجچشسیلتنمکگظطزرذدپو۱۲۳۴۵۶۷۸۹۰'
wikipedia.output(u'\03{lightgreen} Translating Links .... \03{default}')
text2 = codecs.open( u'tempresult.txt','r' ,'utf8' )
text = text2.read()
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
for item in linken:
passport=True
if not item in text:
continue
if item.find(u'File:')!=-1 or item.find(u'file:')!=-1 or item.find(u'Image:')!=-1 or item.find(u'image:')!=-1 or item.find(u'Category:')!=-1 or item.find(u'category:')!=-1:
continue
for i in farsichar:
if i in item:
passport=False
break
if not passport:
continue
itemmain=item
item=item.replace(u'en:',u'')
if item.find('user:')!=-1 or item.find('User:')!=-1 or item.find('template:')!=-1 or item.find('Template:')!=-1 or item.find('category:')!=-1 or item.find('Category:')!=-1 or item.find('Wikipedia:')!=-1 or item.find('wikipedia:')!=-1 or item.find('Talk:')!=-1 or item.find('talk:')!=-1 or item.find('Help:')!=-1 or item.find('help:')!=-1:
continue
itemen=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').strip()
if text.find(itemmain)!=-1:
itemfa=englishdictionry(itemen ,'en','ar',False)
wikipedia.output(itemen)
else:
continue
if itemfa==False:
itemen=item.replace(u'[[',u'').replace(u']]',u'').strip()
itemen=itemen.replace(u'[[',u'').replace(u']]',u'')
text=text.replace(u'[['+itemen+u']]',u'@1@'+itemen+u'@2@')
continue
else:
text=text.replace(itemmain,u'@1@'+itemfa+u'@2@')
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
wikipedia.output(str(len(linken))+u' links remain')
text=text.replace(u'@1@',u'[[').replace(u'@2@',u']]')
text=text.replace(u'$$$[[',u"'''").replace(u']]$$$',u"'''")
text=text.replace(u'@@@\n\n@@@',u'@@@\n@@@').strip()
with codecs.open( u'NeededBoxes.txt',mode = 'w',encoding = 'utf8' ) as f:
f.write(text)
wikipedia.output(u'\03{lightblue} Translated boxes have been written to NeededBoxes.txt \03{default}')
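# Parse the command line, build the page generator and hand it to run().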
def main(savewiki,adressfa,nofa):
summary_commandline,template,gen = None,None,None
exceptions,PageTitles,namespaces = [],[],[]
linkingPageTitle=''
encat=''
autoText,autoTitle = False,False
genFactory = pagegenerators.GeneratorFactory()
wikipedia.setAction( msg )
for arg in wikipedia.handleArgs():
if arg == '-autotitle':
autoTitle = True
elif arg == '-autotext':
autoText = True
elif arg.startswith( '-page:' ):
if len(arg) == 6:
PageTitles.append(wikipedia.input( u'Which page do you want to change?' ))
else:
PageTitles.append(arg[6:])
elif arg.startswith( '-cat:' ):
if len(arg) == 5:
encat=wikipedia.input( u'Which Category do you want to change?' )
else:
encat='Category:'+arg[5:]
elif arg.startswith('-except:'):
exceptions.append(arg[8:])
elif arg.startswith( '-namespace:' ):
namespaces.append( int( arg[11:] ) )
elif arg.startswith( '-ns:' ):
namespaces.append( int( arg[4:] ) )
elif arg.startswith( '-ref:' ):
linkingPageTitle= arg[5:]
elif arg.startswith('-file:'):
textfilename = arg[6:]
if not textfilename:
textfilename = wikipedia.input(
u'Please enter the local file name:')
gen = pagegenerators.TextfilePageGenerator(textfilename,'en')
elif arg.startswith( '-nofa:' ):
nofa=arg[6:].strip()
nofa=nofa.replace(u'faonly',u'onlyfa').replace(u'enonly',u'onlyen')
if nofa!='onlyfa':
nofa='onlyen'
elif arg.startswith( '-onlyfa' ):
nofa='onlyfa'
elif arg.startswith( '-onlyen' ):
nofa='onlyen'
elif arg.startswith( '-save' ):
savewiki= True
adressfa= wikipedia.input(u'Write your wiki subpage like (User:yourusername/findbox) :> ').decode('utf-8')
else:
generator = genFactory.handleArg(arg)
if generator:
gen = generator
#--------------------------------------------------------------------the template name----------------
#linkingPageTitle=u'Template:Infobox school'
#--------------------------------------------------------------------------------------------------------
if encat!='':
encatfalist=encatlist(encat)
if encatfalist!=False:
run(encatfalist,savewiki,adressfa,nofa)
if PageTitles:
pages = [wikipedia.Page(enSite,PageTitle) for PageTitle in PageTitles]
gen = iter( pages )
if linkingPageTitle:
linkingPage = wikipedia.Page(enSite, linkingPageTitle)
pages=pagegenerators.ReferringPageGenerator(linkingPage)
gen = iter(pages)
wikipedia.output(linkingPageTitle)
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator( gen,namespaces )
preloadingGen = pagegenerators.PreloadingGenerator( gen,pageNumber = 60 )
run(preloadingGen,savewiki,adressfa,nofa)
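# Count how often each English category without an Arabic equivalent occurs
# in encats.txt and write a sortable wikitable to NeededCategories.txt.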
def catenlist():
wikipedia.output(u'\03{lightgreen} Listing Needed categories.... \03{default}')
text2 = codecs.open( u'encats.txt','r' ,'utf8' )
text = text2.read()
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
for item in linken:
itemmain=item
item=item.replace(u':en:',u'').replace(u'en:',u'')
itemen=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').strip()
if text.find(itemmain)!=-1:
itemfa=englishdictionry(itemen ,'en','ar',False)
wikipedia.output(itemen)
else:
continue
if itemfa==False:
itemen=item.replace(u'[[',u'').replace(u']]',u'').strip()
itemen=itemen.replace(u'[[',u'').replace(u']]',u'')
text=text.replace(u'[['+itemen+u']]',u'@1@'+itemen+u'@2@')
continue
else:
text=text.replace(itemmain,u'')
text=text.replace(u'[[]]',u'').replace(u'@1@@2@',u'')
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
wikipedia.output(str(len(linken))+u' links remain')
text=text.replace(u'@1@',u'[[').replace(u'@2@',u']]')
textup=u"التصانيف المعادلة التي يجب إنشاؤها.\n"
textup+=u'{| class="wikitable sortable"\n!الرقم!!التصنيف!!مرات الأستخدام\n'
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
counters=0
dict={}
for item in linken:
if text.lower().count(item.lower())==0:
continue
dict[item]=text.count(item)
counters=0
for key, value in sorted(dict.iteritems(), key=lambda (k,v): (v,k), reverse=True):
counters+=1
textup+=u'|-\n|'+str(counters)+u'||'+key+u'||'+str(value)+u'\n'
text=text.replace(item,u'').replace(u'[[]]',u'').replace(u'[[',u'[[:en:')
textup+=u'|-\n|}'
savefile='NeededCategories.txt'
with codecs.open(savefile ,mode = 'w',encoding = 'utf8' ) as f:
f.write(textup)
wikipedia.output(u"\03{lightblue} Needed Category's File is made and it's name is "+unicode(savefile,'UTF-8')+u" \03{default}")
def mainarticles():
wikipedia.output(u'\03{lightgreen} Listing Needed Articles....\03{default}')
farsichar=u'ابضصثقفغعهخحجچشسیلتنمکگظطزرذدپو۱۲۳۴۵۶۷۸۹۰'
filesample = 'NeededBoxes.txt'
text2 = codecs.open( filesample,'r' ,'utf8' )
text = text2.read()
linken =re.findall(ur'\[\[.*?\]\]',text, re.S)
enlinks=[]
for item in linken:
passport=True
if not item in text:
continue
if item.find(u'File:')!=-1 or item.find(u'file:')!=-1 or item.find(u'Image:')!=-1 or item.find(u'image:')!=-1 or item.find(u'Category:')!=-1 or item.find(u'category:')!=-1:
continue
for i in farsichar:
if i in item:
passport=False
break
if passport:
itemnew=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').replace(u':en:',u'').replace(u'en:',u'').strip()
redirect=redirectquery(itemnew,'en')
if redirect:
enlink=u'AAA'+redirect+u'$$$'
else:
continue
text=text.replace(item,enlink)
linken =re.findall(ur'\[\[.*?\]\]',text, re.S)
wikipedia.output(item)
wikipedia.output(str(len(linken))+u' links remain')
text=text.replace(u'AAA',u'[[').replace(u'$$$',u']]')
linken =re.findall(ur'\[\[.*?\]\]',text, re.S)
for item in linken:
passport=True
itemmain=item
item=item.replace(u':en:',u'').replace(u'en:',u'')
item=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').strip()
for i in farsichar:
if i in item:
passport=False
break
if item.find(u'File:')!=-1 or item.find(u'file:')!=-1 or item.find(u'Image:')!=-1 or item.find(u'image:')!=-1 or item.find(u'Category:')!=-1 or item.find(u'category:')!=-1:
continue
if text.find(u'[['+item)!=-1 and passport:
if not item in enlinks:
enlinks.append(item)
textup=u"المقالات يجب إنشاؤها كي لاتبقى وصلة حمراء في المقالات.\n"
textup+=u'{|class="wikitable sortable"\n!الرقم!!المقالة!!مرات الاستخدام\n'
dict={}
for item in enlinks:
if text.count(u'[['+item)==0:
continue
dict[item]=text.count(u'[['+item)
counters=0
for key, value in sorted(dict.iteritems(), key=lambda (k,v): (v,k), reverse=True):
counters+=1
textup+=u'|-\n|'+str(counters)+u'||[[:en:'+key+u']]||'+str(value)+u'\n'
textup+=u'|-\n|}'
savefile='NeededArticles.txt'
with codecs.open( savefile,mode = 'w',encoding = 'utf8' ) as f:
f.write( textup )
wikipedia.output(u"\03{lightblue} Needed Article's File is made and it's name is "+unicode(savefile,'UTF-8')+u" \03{default}")
def exceleporter():
wikipedia.output(u'\03{lightgreen} Making excel File.... \03{default}')
count = 0
filesample = 'NeededBoxes.txt'
text2 = codecs.open( filesample,'r' ,'utf8' )
text = text2.read()
#-------------------------------------------------------
#Fields section
# The infobox fields that should be extracted must be listed here
#items=(u'number',u'organ',u'date',u'year',u'meeting',u'code',u'document',u'for',u'abstention',u'against',u'subject',u'result',u'image',u'caption')
#items=(u'name',u'image',u'caption',u'fullname',u'birth_date',u'birth_place',u'death_date',u'death_place',u'height',u'position',u'currentclub',u'youthyears1',u'youthclubs1',u'collegeyears1',u'collegeclubs1',u'collegecaps1',u'collegegoals1',u'years1',u'clubs1',u'caps1',u'goals1',u'totalcaps',u'totalgoals',u'nationalyears1',u'nationalteam1',u'nationalcaps1',u'nationalgoals1',u'medaltemplates',u'manageryears1',u'managerclubs1',u'club-update',u'nationalteam-update')
#items=(u'name',u'official_name',u'image',u'image_size',u'image_caption',u'image_alt',u'location_map',u'location_map_width',u'location_map_text',u'lat_d',u'lat_m',u'lat_s',u'lat_NS',u'long_d',u'long_m',u'long_s',u'long_EW',u'coordinates_type',u'coordinates_display',u'coordinates_ref',u'country',u'location',u'status',u'construction_began',u'commissioned',u'licence_expires',u'decommissioned',u'cost',u'owner',u'operator',u'developer',u'constructor',u'reactors_operate_mw',u'reactors_const_mw',u'reactors_planned_mw',u'reactors_decom_mw',u'reactors_cancel_mw',u'reactor_type',u'reactor_supplier',u'turbine_manu_npp',u'installed_capacity',u'max_planned_cap',u'capacity_factor',u'average_annual_gen',u'net_generation',u'website',u'as_of',u'extra')
#items=(u'name ',u'image_skyline',u'image_map',u'latd',u'longd',u'area_total_km2',u'population_total',u'population_as_of',u'population_density_km2',u'timezone1',u'postal_code',u'website',u'image') #City
items=(u'number',u'award',u'image',u'caption',u'date',u'site',u'host',u'producer',u'director',u'best_picture',u'most_wins',u'most_nominations',u'network',u'duration',u'ratings',u'last',u'next')
#-------------------------------------------------------
for a in range(0,30):
text=text.replace(u' =',u'=').replace(u'| ',u'|').replace(u'= ',u'=')
text=text.replace(u'@@@\n\n@@@',u'@@@').replace(u'\t',u'')
dict={}
count=-1
text=re.sub(ur'(?s)\<ref(.*?)\/ref\>',ur"", text)
text=re.sub(ur'(?s)\<ref(.*?)\/>',ur"", text)
count=-1
for item in items:
count+=1
dict[count]=[item]
dict[count+1]=[u'names']
for pag in text.split(u'@@@' ):
pag=pag.replace(u'\r',u'')
if pag.strip()==u'':
continue
try:
onvan=pag.split("'''")[1].strip()
except:
continue
count=-1
for item in items:
count+=1
try:
im = re.search(ur'\|'+item+u'\=.*?\\n\|', pag)
itema=im.group(0).split(item+u'=')[1].replace(u'\n|',u'').replace(u'\n',u'').replace(u"''",u"").strip()
itema=itema.split(u'<')[0].strip()
if itema==u'N/A' or itema==u'*' or itema==u'':
itema=u' '
except:
itema=u' '
dict[count].append(itema)
onvan=pag.split("'''")[1].replace(u'en:',u'').strip()
dict[count+1].append(onvan)
number=len(items)+1
total=len(dict[0])
text=u'\n'
for b in range(0,total):
for a in range(0,number):
text+=dict[a][b]+u'\t'
text+=u'\n'
savefile='resultexcelc.txt'
with codecs.open(savefile ,mode = 'w',encoding = 'utf8' ) as f:
f.write( text.strip() )
wikipedia.output(u"\03{lightblue} Excel File is made and it's name is "+unicode(savefile,'UTF-8')+u"\03{default}")
if __name__ == "__main__":
try:
main(savewiki,adressfa,nofa)
except:
pass
linktranslation()
exceleporter()
catenlist()
mainarticles()